The hashlib module provides secure hash functions for checksums, data integrity, and password storage.
Basic Hashing
import hashlib
# Hash a string
data = "Hello, World!"
hash_obj = hashlib.sha256(data.encode())
print(hash_obj.hexdigest())
# 'dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f'
print(hash_obj.digest()) # Raw bytes
print(hash_obj.digest_size) # 32 bytes
Incremental Hashing
For large data or streaming:
import hashlib
hasher = hashlib.sha256()
hasher.update(b"chunk 1")
hasher.update(b"chunk 2")
hasher.update(b"chunk 3")
# Same as hashing "chunk 1chunk 2chunk 3"
print(hasher.hexdigest())
File Checksums
import hashlib
def file_hash(path, algorithm='sha256', chunk_size=8192):
    """Return the hex digest of the file at ``path``.

    The file is opened in binary mode and fed to a ``hashlib.new(algorithm)``
    object ``chunk_size`` bytes at a time, so the whole file is never held
    in memory at once.
    """
    digest = hashlib.new(algorithm)
    with open(path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()
# Verify file integrity
checksum = file_hash('download.zip')
print(f"SHA-256: {checksum}")
# Compare with expected
expected = "abc123..."
if checksum == expected:
print("File integrity verified")
Multiple Hash Algorithms
import hashlib
def multi_hash(data: bytes) -> dict:
    """Return hex digests of ``data`` under MD5, SHA-1, SHA-256 and SHA-512.

    The result maps each lowercase algorithm name to its hex digest, in the
    order listed above.
    """
    digests = {}
    for name in ('md5', 'sha1', 'sha256', 'sha512'):
        # Each name is also the name of the hashlib constructor.
        constructor = getattr(hashlib, name)
        digests[name] = constructor(data).hexdigest()
    return digests
hashes = multi_hash(b"test data")
# {'md5': '...', 'sha1': '...', 'sha256': '...', 'sha512': '...'}
Available Algorithms
import hashlib
# Always available
print(hashlib.algorithms_guaranteed)
# {'sha256', 'sha384', 'sha512', 'sha1', 'md5', ...}
# Available on this system
print(hashlib.algorithms_available)
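# Superset of algorithms_guaranteed (which already contains 'sha3_256' and 'blake2b');
# may add OpenSSL-provided algorithms such as 'sha512_256', 'sm3' or 'ripemd160'.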
# Use by name
hasher = hashlib.new('sha3_256')
hasher.update(b"data")
BLAKE2 for Speed
BLAKE2 is faster than SHA-256 while being equally secure:
import hashlib
# BLAKE2b (optimized for 64-bit)
b2b = hashlib.blake2b(b"data", digest_size=32)
print(b2b.hexdigest())
# BLAKE2s (optimized for 32-bit)
b2s = hashlib.blake2s(b"data", digest_size=32)
print(b2s.hexdigest())
# Keyed hashing (MAC)
key = b"secret key"
mac = hashlib.blake2b(b"message", key=key, digest_size=32)
SHA-3 Family
import hashlib
# SHA-3 variants
sha3_256 = hashlib.sha3_256(b"data").hexdigest()
sha3_512 = hashlib.sha3_512(b"data").hexdigest()
# SHAKE (variable output length)
shake = hashlib.shake_256(b"data")
print(shake.hexdigest(32)) # 32 bytes output
print(shake.hexdigest(64)) # 64 bytes output
Password Hashing (Don't Use Raw hashlib!)
import hashlib
import os
# DON'T do this for passwords
# bad_hash = hashlib.sha256(password.encode()).hexdigest()
# DO use PBKDF2
def hash_password(password: str) -> tuple[bytes, bytes]:
    """Derive a PBKDF2-HMAC-SHA256 key from ``password`` using a fresh salt.

    Returns ``(salt, key)``: a random 32-byte salt from ``os.urandom`` and
    the 32-byte key derived with 100,000 iterations. Both values must be
    stored in order to check the password later.
    """
    fresh_salt = os.urandom(32)
    password_bytes = password.encode()
    derived = hashlib.pbkdf2_hmac('sha256', password_bytes, fresh_salt, 100_000, 32)
    return fresh_salt, derived
def verify_password(password: str, salt: bytes, key: bytes) -> bool:
    """Check ``password`` against a stored PBKDF2 salt/key pair.

    Re-derives a key with PBKDF2-HMAC-SHA256 (100,000 iterations, 32-byte
    output) from ``password`` and the stored ``salt``, then compares it
    with the stored ``key``.

    Args:
        password: Candidate password; encoded with ``str.encode()``.
        salt: Salt that was used when the stored key was derived.
        key: Stored derived key to compare against.

    Returns:
        True if the re-derived key equals ``key``, otherwise False.
    """
    # Local import: the surrounding snippet only imports hashlib and os.
    import hmac

    candidate = hashlib.pbkdf2_hmac(
        'sha256',
        password.encode(),
        salt,
        iterations=100_000,
        dklen=32,
    )
    # Constant-time comparison; a plain == returns at the first differing
    # byte and can leak how much of the key matched through timing.
    return hmac.compare_digest(candidate, key)
# Usage
salt, hashed = hash_password("my_password")
print(verify_password("my_password", salt, hashed)) # True
Content-Addressable Storage
import hashlib
from pathlib import Path
class ContentStore:
"""Store files by their content hash."""
def __init__(self, store_dir: str):
self.store = Path(store_dir)
self.store.mkdir(exist_ok=True)
def put(self, data: bytes) -> str:
"""Store data and return its hash."""
content_hash = hashlib.sha256(data).hexdigest()
# Use first 2 chars as directory (like Git)
subdir = self.store / content_hash[:2]
subdir.mkdir(exist_ok=True)
path = subdir / content_hash
if not path.exists():
path.write_bytes(data)
return content_hash
def get(self, content_hash: str) -> bytes | None:
"""Retrieve data by hash."""
path = self.store / content_hash[:2] / content_hash
if path.exists():
return path.read_bytes()
return None
store = ContentStore('./content')
hash_id = store.put(b"my data")
data = store.get(hash_id)
Merkle Tree
import hashlib
def merkle_root(items: list[bytes]) -> str:
    """Return the hex-encoded SHA-256 Merkle root of ``items``.

    Each item is hashed to form the leaf level. A level is then reduced by
    hashing the concatenation of every adjacent pair of digests, repeating
    until a single digest remains. A level with an odd number of nodes has
    its last node duplicated before pairing. An empty list yields the
    SHA-256 of the empty byte string.
    """
    sha256 = hashlib.sha256
    if not items:
        return sha256(b"").hexdigest()

    level = [sha256(leaf).digest() for leaf in items]
    while len(level) > 1:
        if len(level) % 2 == 1:
            # Odd level: pair the last node with a copy of itself.
            level.append(level[-1])
        left_nodes = level[0::2]
        right_nodes = level[1::2]
        level = [
            sha256(left + right).digest()
            for left, right in zip(left_nodes, right_nodes)
        ]
    return level[0].hex()
# Usage
items = [b"tx1", b"tx2", b"tx3", b"tx4"]
root = merkle_root(items)
Cache Keys
import hashlib
import json
def cache_key(*args, **kwargs) -> str:
    """Generate a deterministic 16-hex-character cache key from arguments.

    The arguments are serialized to JSON with sorted keys, so keyword
    order does not affect the key. Values that JSON cannot encode are
    converted with ``str()``; two such objects with the same ``str()``
    therefore produce the same key.

    Returns:
        The first 16 hex characters of the MD5 digest of the serialized
        arguments.
    """
    # Serialize arguments
    key_data = json.dumps(
        {'args': args, 'kwargs': kwargs},
        sort_keys=True,
        default=str,
    )
    # MD5 is used only as a short fingerprint, not for security. Saying so
    # lets this run on FIPS-restricted builds, where a plain hashlib.md5()
    # call raises ValueError; the digest itself is unchanged.
    digest = hashlib.md5(key_data.encode(), usedforsecurity=False)
    return digest.hexdigest()[:16]
# Usage
key = cache_key('users', page=1, limit=10)
# 'a1b2c3d4e5f67890'
Data Deduplication
import hashlib
from pathlib import Path
def find_duplicates(directory: str) -> dict[str, list[Path]]:
    """Find files under ``directory`` that have identical content.

    Every regular file below ``directory`` (searched recursively) is hashed
    with SHA-256. Files are read in fixed-size chunks so a large file is
    never loaded into memory in one piece.

    Returns:
        A mapping from SHA-256 hex digest to the list of paths sharing that
        digest, containing only digests shared by two or more files.
    """
    chunk_size = 1 << 16  # 64 KiB per read
    hash_to_files: dict[str, list[Path]] = {}
    for path in Path(directory).rglob('*'):
        if not path.is_file():
            continue
        hasher = hashlib.sha256()
        with open(path, 'rb') as f:
            while chunk := f.read(chunk_size):
                hasher.update(chunk)
        hash_to_files.setdefault(hasher.hexdigest(), []).append(path)
    # Return only duplicates
    return {h: files for h, files in hash_to_files.items() if len(files) > 1}
duplicates = find_duplicates('./documents')
for hash_val, files in duplicates.items():
print(f"Duplicates ({hash_val[:8]}...):")
for f in files:
print(f" {f}")
Hashing Algorithm Comparison
| Algorithm | Output | Speed | Security | Use Case |
|---|---|---|---|---|
| MD5 | 128-bit | Fast | Broken | Checksums only |
| SHA-1 | 160-bit | Fast | Weak | Legacy systems |
| SHA-256 | 256-bit | Good | Strong | General purpose |
| SHA-512 | 512-bit | Good | Strong | High security |
| BLAKE2b | Variable | Fast | Strong | Performance-critical |
| SHA-3 | Variable | Moderate | Strong | SHA-2 alternative (different design, no length extension) |
Rule of thumb: Use SHA-256 for general hashing, BLAKE2 for speed, PBKDF2/bcrypt/Argon2 for passwords.
React to this post: