Regular expressions are powerful but tricky. Here's how to use them effectively in Python.

Basic Matching

import re
 
# Check if pattern exists
if re.search(r"hello", "hello world"):
    print("Found")
 
# Match at start only
if re.match(r"hello", "hello world"):
    print("Starts with hello")
 
# Full string match
if re.fullmatch(r"hello", "hello"):
    print("Exact match")
 
# Find all occurrences
matches = re.findall(r"\d+", "abc 123 def 456")
print(matches)  # ['123', '456']

Raw Strings

# Always use raw strings (r"...") for patterns, so Python passes every
# backslash through to the regex engine untouched.
pattern = r"\d+"      # Good: the pattern text is backslash + "d+"
pattern = "\\d+"      # Same pattern text, but harder to read

# Raw vs. normal string literals for "\n":
r"\n"   # Two characters (backslash, n); the regex engine reads this escape as a newline
"\n"    # One actual newline character; as a pattern it also matches a newline
# The distinction changes the match for escapes that Python itself understands:
# r"\b" is a regex word boundary, while "\b" is a backspace character.

Match Objects

import re
 
text = "Email: alice@example.com"
match = re.search(r"(\w+)@(\w+)\.(\w+)", text)
 
if match:
    print(match.group())   # alice@example.com
    print(match.group(1))  # alice
    print(match.group(2))  # example
    print(match.groups())  # ('alice', 'example', 'com')
    print(match.start())   # 7
    print(match.end())     # 24
    print(match.span())    # (7, 24)

Named Groups

import re
 
pattern = r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<tld>\w+)"
match = re.search(pattern, "alice@example.com")
 
if match:
    print(match.group("user"))    # alice
    print(match.group("domain"))  # example
    print(match.groupdict())      # {'user': 'alice', 'domain': 'example', 'tld': 'com'}

Flags

import re
 
# Case insensitive
re.search(r"hello", "HELLO", re.IGNORECASE)
re.search(r"(?i)hello", "HELLO")  # Inline flag
 
# Multiline (^ and $ match line boundaries)
re.findall(r"^\w+", "line1\nline2", re.MULTILINE)
 
# Dotall (. matches newlines too)
re.search(r"a.b", "a\nb", re.DOTALL)
 
# Verbose (allow comments and whitespace)
pattern = re.compile(r"""
    \d{3}     # Area code
    [-.]?     # Optional separator
    \d{3}     # First 3 digits
    [-.]?     # Optional separator
    \d{4}     # Last 4 digits
""", re.VERBOSE)
 
# Combine flags
re.search(r"hello", "HELLO\nWORLD", re.IGNORECASE | re.MULTILINE)

Compiled Patterns

import re
 
# Compile once, use many times
EMAIL_RE = re.compile(r"[\w.-]+@[\w.-]+\.\w+")
 
def has_email(text):
    """Return True if EMAIL_RE matches anywhere in ``text``, else False."""
    # A match object is always truthy; a failed search yields None.
    return bool(EMAIL_RE.search(text))
 
def find_emails(text):
    """Return a list of every non-overlapping EMAIL_RE match in ``text``."""
    return [found.group(0) for found in EMAIL_RE.finditer(text)]

Substitution

import re
 
# Simple replace
result = re.sub(r"\d+", "X", "abc 123 def 456")
print(result)  # abc X def X
 
# With backreferences
result = re.sub(r"(\w+)@(\w+)", r"\2@\1", "alice@example")
print(result)  # example@alice
 
# With function
def double(match):
    """Replacement callback for re.sub: return the matched integer, doubled, as text."""
    value = int(match.group(0))
    return str(2 * value)
 
result = re.sub(r"\d+", double, "abc 5 def 10")
print(result)  # abc 10 def 20
 
# Limit replacements
result = re.sub(r"\d+", "X", "1 2 3 4 5", count=2)
print(result)  # X X 3 4 5

Splitting

import re
 
# Split on pattern
parts = re.split(r"\s+", "hello   world  foo")
print(parts)  # ['hello', 'world', 'foo']
 
# Keep delimiters
parts = re.split(r"(\s+)", "hello   world")
print(parts)  # ['hello', '   ', 'world']
 
# Limit splits
parts = re.split(r"\s+", "a b c d e", maxsplit=2)
print(parts)  # ['a', 'b', 'c d e']

Common Patterns

import re
 
# NOTE: every pattern below is unanchored and checks shape only. Use
# re.fullmatch() to validate a whole string, and remember that \d and \w are
# Unicode-aware for str patterns unless re.ASCII is passed.

# Email (simplified): word chars, dots and hyphens around "@", ending in ".<word chars>"
EMAIL = r"[\w.-]+@[\w.-]+\.\w+"

# URL: http/https scheme, host and an optional path.
# Ports, query strings and fragments are not part of the match.
URL = r"https?://[\w.-]+(?:/[\w./-]*)?"

# Phone (US): 3-3-4 digits, optional parentheses around the area code,
# optional "-", "." or whitespace between the groups
PHONE = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

# Date (YYYY-MM-DD): digit layout only; month/day ranges are not checked
# (e.g. "2024-99-99" matches)
DATE = r"\d{4}-\d{2}-\d{2}"

# Time (HH:MM with optional :SS): digit layout only; "99:99" matches
TIME = r"\d{2}:\d{2}(?::\d{2})?"

# IPv4: four dot-separated groups of 1-3 digits; octets are not limited
# to 0-255 (e.g. "999.999.999.999" matches)
IPV4 = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

# Slug (URL-safe string): runs of lowercase letters/digits joined by single hyphens
SLUG = r"[a-z0-9]+(?:-[a-z0-9]+)*"

# Username: a letter followed by 2-29 letters, digits or underscores
# (3-30 characters in total)
USERNAME = r"[a-zA-Z][a-zA-Z0-9_]{2,29}"

Non-Greedy Matching

import re
 
text = "<tag>content</tag>"
 
# Greedy (default): matches as much as possible
print(re.search(r"<.*>", text).group())  # <tag>content</tag>
 
# Non-greedy: matches as little as possible
print(re.search(r"<.*?>", text).group())  # <tag>

Lookahead and Lookbehind

import re
 
# Positive lookahead: match only if followed by
re.findall(r"\d+(?= dollars)", "100 dollars 50 euros")  # ['100']

# Negative lookahead: match only if NOT followed by
# The \b stops the engine from backtracking to a shorter run of digits:
# without it, "10" (followed by "0", not " dollars") would match as well.
re.findall(r"\d+\b(?! dollars)", "100 dollars 50 euros")  # ['50']

# Positive lookbehind: match only if preceded by
re.findall(r"(?<=\$)\d+", "$100 £50")  # ['100']

# Negative lookbehind: match only if NOT preceded by
# The \b forces the match to begin at the first digit of a number:
# without it, "00" (preceded by "1", not "$") would match as well.
re.findall(r"(?<!\$)\b\d+", "$100 £50")  # ['50']

Practical Examples

import re
 
# Extract domain from URL
def get_domain(url):
    """Return the host part of the first http(s) URL in ``url``, or None.

    The host is everything after ``http://`` or ``https://`` up to the first
    ``/``, ``?`` or ``#``, so a query string or fragment on a path-less URL
    (``https://example.com?q=1``) is not returned as part of the domain.
    A port or ``user@`` prefix, if present, is kept as written.
    """
    match = re.search(r"https?://([^/?#]+)", url)
    return match.group(1) if match else None
 
# Validate password strength
def is_strong_password(password):
    """Return True only if ``password`` satisfies every strength rule.

    Rules: a run of at least 8 non-newline characters, an ASCII uppercase
    letter, an ASCII lowercase letter, a digit, and one of ``!@#$%^&*``.
    """
    required_patterns = (
        r".{8,}",        # length
        r"[A-Z]",        # uppercase letter
        r"[a-z]",        # lowercase letter
        r"\d",           # digit
        r"[!@#$%^&*]",   # special character
    )
    for rule in required_patterns:
        if re.search(rule, password) is None:
            return False
    return True
 
# Clean whitespace
def normalize_whitespace(text):
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
 
# Extract hashtags
def find_hashtags(text):
    """Return the tag names (without the leading ``#``) found in ``text``, in order."""
    return [tag.group(1) for tag in re.finditer(r"#(\w+)", text)]
 
# Mask sensitive data
def mask_email(text):
    """Mask the local part of every email address found in ``text``.

    Each address keeps its first character and its domain; the rest of the
    local part is replaced by ``***`` (``alice@example.com`` becomes
    ``a***@example.com``). Text surrounding the addresses is left unchanged.
    """
    return re.sub(
        # The local part excludes whitespace so a match cannot begin in an
        # earlier word and swallow the text leading up to the address.
        # The domain accepts dots and hyphens (e.g. mail.my-site.org).
        r"(\w)[^@\s]*(@[\w.-]+\.\w+)",
        r"\1***\2",
        text,
    )

Escaping

import re
 
# Escape special characters
user_input = "hello (world)"
pattern = re.escape(user_input)
print(pattern)  # hello\ \(world\)
 
# Safe search with user input
def safe_search(text, query):
    """Search ``text`` for ``query`` as literal text; return the match object or None."""
    # Escape first so regex metacharacters in the query carry no special meaning.
    literal_pattern = re.escape(query)
    return re.search(literal_pattern, text)

Performance Tips

import re
 
# Compile frequently used patterns
PATTERN = re.compile(r"\d+")

# Use non-capturing groups when you don't need captures
r"(?:https?://)?"  # Non-capturing
r"(https?://)?"    # Capturing (slower, stores match)

# Anchor patterns when possible
# NOTE: anchoring also changes what matches (start/end only), so use it
# when that restriction is what you actually want.
r"^start"          # Faster than r"start" at beginning
r"end$"            # Faster than r"end" at end

# Be specific
# NOTE: for str patterns \d and \w are Unicode-aware unless re.ASCII is set,
# so the explicit classes below are narrower, not identical.
r"[0-9]"           # ASCII digits only; \d also matches other Unicode decimal digits
r"[a-zA-Z0-9_]"    # ASCII-only counterpart of \w

Common Mistakes

import re
 
# Mistake: forgetting raw string
# "\d" is an invalid string escape: Python keeps the backslash, but warns
# (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
re.search("\d+", "123")     # Works, but only by accident and with a warning
re.search(r"\d+", "123")    # Better

# Mistake: using match when you want search
re.match(r"world", "hello world")   # None (match is at start only)
re.search(r"world", "hello world")  # Match found

# Mistake: greedy when you want non-greedy
re.sub(r"<.*>", "", "<a>text</a>")    # '' - greedy .* runs from the first "<" to the last ">"
re.sub(r"<.*?>", "", "<a>text</a>")   # 'text' - each tag is removed on its own

# Mistake: not escaping user input
# NOTE: `text` is not defined in this snippet; it stands for whatever string
# is being searched.
user = "hello.*"
re.search(user, text)               # Dangerous: the input is interpreted as a regex
re.search(re.escape(user), text)    # Safe: matches the literal characters "hello.*"

Regular expressions are a precision tool. Use them for pattern matching, but don't force them where simpler string methods work.

React to this post: