The `pickle` module serializes Python objects to bytes and restores them later. Here's how to use it safely.

Basic Usage

import pickle
 
# Save object: any picklable value works; a plain dict is used here
data = {"users": ["Alice", "Bob"], "count": 2}

# pickle writes bytes, so the file must be opened in binary mode ("wb")
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)

# Load object: binary mode again ("rb"); only load files you wrote yourself
with open("data.pkl", "rb") as f:
    loaded = pickle.load(f)

print(loaded)  # {'users': ['Alice', 'Bob'], 'count': 2}

Bytes Operations

# To bytes: dumps() returns the pickle in memory instead of writing to a file
data = [1, 2, 3]
pickled = pickle.dumps(data)

# From bytes: loads() rebuilds an equal, but separate, object from those bytes
unpickled = pickle.loads(pickled)

What Can Be Pickled

# ✓ Built-in types (each call returns the pickled bytes, discarded here)
pickle.dumps(42)                # int
pickle.dumps("hello")           # str
pickle.dumps([1, 2, 3])         # list
pickle.dumps({"key": "value"})  # dict
pickle.dumps((1, 2, 3))         # tuple
pickle.dumps({1, 2, 3})         # set
 
# ✓ Classes and instances
class User:
    """Minimal user record used to demonstrate pickling an instance."""

    def __init__(self, name):
        """Store ``name`` on the instance."""
        self.name = name

# The pickle records the class by name plus the instance's attributes, so
# the User class must be importable wherever the bytes are loaded.
pickle.dumps(User("Alice"))
 
# ✓ Functions (by reference: only the qualified name is stored, not the code)
pickle.dumps(len)

# ✗ Cannot pickle (each call below raises instead of returning bytes)
# - a lambda has no importable name            -> pickle.PicklingError
# - an open file wraps an OS-level handle      -> TypeError
#   (this example also never closes the handle it opens)
# - a lock wraps OS-level state                -> TypeError
#   (`lock` is not defined in this snippet; presumably a threading.Lock())
pickle.dumps(lambda x: x)      # Lambdas
pickle.dumps(open("file.txt")) # File objects
pickle.dumps(lock)             # Threading locks

Protocols

# Protocol versions (higher = more efficient)
# NOTE: `data` is the object to save and `f` a file opened in "wb" mode.
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

# Protocol 0: ASCII (readable)
# Protocol 1: Old binary
# Protocol 2: Python 2.3+
# Protocol 3: Python 3.0+
# Protocol 4: Python 3.4+ (default in 3.8–3.13)
# Protocol 5: Python 3.8+ (out-of-band buffers; default in 3.14+)

# Check default (the value depends on the running Python version)
print(pickle.DEFAULT_PROTOCOL)

Custom Pickling

class Connection:
    """Wrapper around a live socket to ``host`` that can be pickled.

    The socket is left out of the pickled state and re-opened on load.
    """

    def __init__(self, host):
        """Remember ``host`` and open the socket through ``connect``."""
        self.host = host
        self._socket = connect(host)  # live handle; never written to a pickle

    def __getstate__(self):
        """Return the instance attributes, minus ``_socket``, for pickling."""
        # Work on a shallow copy so the live instance keeps its socket.
        picklable = dict(self.__dict__)
        picklable.pop("_socket")
        return picklable

    def __setstate__(self, state):
        """Restore the pickled attributes, then reconnect to ``self.host``."""
        vars(self).update(state)
        self._socket = connect(self.host)

Using `__reduce__`

class ComplexObject:
    """Object that tells pickle how to rebuild it through ``__reduce__``."""

    def __init__(self, data):
        """Keep ``data`` as the object's payload."""
        self.data = data

    def __reduce__(self):
        """Return ``(callable, args)``; the object is rebuilt as ``callable(*args)``."""
        rebuild = type(self)
        ctor_args = (self.data,)
        return rebuild, ctor_args

Security Warning

⚠️ Never unpickle untrusted data!

# DANGER: This can execute arbitrary code
# Unpickling calls whatever callable the byte stream names (the same
# (callable, args) mechanism that __reduce__ uses), so the bytes themselves
# decide what gets run.
pickle.loads(untrusted_bytes)  # Don't do this!

Malicious pickles can:

  • Execute shell commands
  • Delete files
  • Open network connections
  • Run any Python code

Safer Alternatives

# For simple data, use JSON
import json
json.dumps({"key": "value"})  # returns text; parsing JSON never runs code

# For configuration, use TOML/YAML
import tomllib  # Python 3.11+; parses TOML only, it has no writer

# For data exchange, use structured formats
# - Protocol Buffers
# - MessagePack
# - Apache Avro

If You Must Use pickle

# Only load from trusted sources
# Verify file integrity with hashes
import hashlib
from pathlib import Path
 
def save_secure(obj, path, hash_path):
    """Pickle ``obj`` to ``path`` and store its SHA-256 hex digest in ``hash_path``."""
    payload = pickle.dumps(obj)
    digest = hashlib.sha256(payload).hexdigest()
    Path(path).write_bytes(payload)
    Path(hash_path).write_text(digest)
 
def load_secure(path, hash_path):
    """Unpickle ``path`` only if its SHA-256 matches the digest in ``hash_path``.

    The check catches truncated or corrupted files. It is not authentication:
    anyone able to rewrite both files can replace the pickle and its digest
    together, so both must still come from a trusted location.

    Args:
        path: File containing the pickled bytes.
        hash_path: Text file whose first whitespace-separated token is the
            expected SHA-256 hex digest.

    Returns:
        The unpickled object.

    Raises:
        ValueError: If the digest of ``path`` differs from the stored one.
    """
    data = Path(path).read_bytes()
    # Digest files written by editors or by tools such as ``sha256sum`` may
    # end with a newline, use upper-case hex, or append the file name, so
    # compare only the normalized first token.
    tokens = Path(hash_path).read_text().split()
    expected = tokens[0].lower() if tokens else ""
    actual = hashlib.sha256(data).hexdigest()
    if actual != expected:
        raise ValueError("File integrity check failed")
    return pickle.loads(data)

Common Use Cases

Caching

import functools
import hashlib
import pickle
from pathlib import Path


def cached(func):
    """Cache ``func``'s results on disk, one pickle file per distinct call.

    Files live in ``.cache`` under the current working directory, which is
    created when the decorator is applied. Arguments and results must be
    picklable.

    Args:
        func: The function whose results should be cached.

    Returns:
        A wrapper with ``func``'s name and docstring that returns the stored
        result when one exists and otherwise calls ``func`` and stores it.
    """
    cache_dir = Path(".cache")
    cache_dir.mkdir(exist_ok=True)

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # Identify the function by module and qualified name so same-named
        # functions do not share entries, and sort kwargs so f(a=1, b=2) and
        # f(b=2, a=1) map to the same file.
        call_id = (
            func.__module__,
            func.__qualname__,
            args,
            sorted(kwargs.items()),
        )
        key = hashlib.sha256(pickle.dumps(call_id)).hexdigest()
        cache_file = cache_dir / f"{key}.pkl"

        try:
            # NOTE: this unpickles whatever is in .cache, so the directory
            # must be writable only by code you trust.
            return pickle.loads(cache_file.read_bytes())
        except FileNotFoundError:
            pass  # cache miss: compute and store below

        result = func(*args, **kwargs)
        cache_file.write_bytes(pickle.dumps(result))
        return result

    return wrapper
 
@cached
def expensive_computation(n):
    """Placeholder for a slow function whose results are worth caching."""
    # ... (the real work goes here and must assign `result`)
    return result

Saving ML Models

import pickle

# Save model (`model` stands for an already-trained estimator object)
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

# Load model (only from files you created yourself)
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

# Better: Use joblib for large numpy arrays
# NOTE: joblib is a third-party package and is itself built on pickle, so
# joblib.load needs the same "trusted files only" care.
import joblib
joblib.dump(model, "model.joblib")
model = joblib.load("model.joblib")

Session State

class Session:
    """Dictionary of session data persisted to a single pickle file."""

    def __init__(self, path):
        """Bind the session to ``path`` and start with empty ``data``."""
        self.path = Path(path)
        self.data = {}

    def save(self):
        """Write ``self.data`` to ``self.path`` as a pickle."""
        payload = pickle.dumps(self.data)
        self.path.write_bytes(payload)

    def load(self):
        """Replace ``self.data`` with the file's contents; do nothing if the file is missing."""
        if not self.path.exists():
            return
        raw = self.path.read_bytes()
        self.data = pickle.loads(raw)

Performance Tips

# Use highest protocol for speed and size
# (files written this way may not load on older Python versions)
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

# For large objects, consider:
# - joblib (efficient for numpy)
# - cloudpickle (for lambdas and closures)
# - dill (more object types)

Quick Reference

import pickle
 
# Save to file (`obj` is any picklable object; binary mode is required)
with open("data.pkl", "wb") as f:
    pickle.dump(obj, f)

# Load from file (trusted files only)
with open("data.pkl", "rb") as f:
    obj = pickle.load(f)

# To/from bytes (no file involved)
data = pickle.dumps(obj)
obj = pickle.loads(data)

# With protocol (`f` must be a file opened in "wb" mode)
pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
 
# Custom pickling
class MyClass:
    """Skeleton showing the two hooks pickle uses for instance state."""

    def __getstate__(self):
        """Hand pickle the instance's attribute dict as its state."""
        return vars(self)

    def __setstate__(self, state):
        """Merge the unpickled ``state`` back into the instance's attributes."""
        vars(self).update(state)

Use pickle for caching and temporary storage. Never unpickle untrusted data. For data exchange, prefer JSON or other structured formats.

React to this post: