Beyond basic matching, Python's re module offers powerful pattern capabilities.

Compiled Patterns

import re
 
# Compile once, use many times
EMAIL = re.compile(r'[\w.-]+@[\w.-]+\.\w+')

# Extract every address embedded in a larger text
emails = EMAIL.findall(text)

# Validate a whole string: match() only anchors at the start, so it would
# accept input such as 'user@example.com and more'. fullmatch() requires
# the pattern to cover the entire input.
if EMAIL.fullmatch(user_input):
    # valid email format
    pass

Flags

import re
 
# Case insensitive
re.search(r'hello', 'HELLO', re.IGNORECASE)
re.search(r'hello', 'HELLO', re.I)
 
# Multiline (^ and $ match line boundaries)
re.findall(r'^item', text, re.MULTILINE)
re.findall(r'^item', text, re.M)
 
# Dotall (. matches newlines)
re.search(r'start.*end', text, re.DOTALL)
re.search(r'start.*end', text, re.S)
 
# Verbose (allow comments and whitespace)
pattern = re.compile(r'''
    \d{3}       # Area code
    [-.\s]?     # Optional separator
    \d{3}       # Exchange
    [-.\s]?     # Optional separator
    \d{4}       # Number
''', re.VERBOSE)
 
# Combine flags
re.search(r'pattern', text, re.I | re.M | re.S)

Named Groups

import re
 
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.match(pattern, '2026-03-21')
 
print(match.group('year'))   # 2026
print(match.group('month'))  # 03
print(match.groupdict())     # {'year': '2026', 'month': '03', 'day': '21'}

Non-Capturing Groups

import re
 
# Capturing groups: findall returns one tuple per match
re.findall(r'(https?)://(\S+)', text)  # Returns (scheme, rest) tuples

# Non-capturing group: (?:...) groups without capturing, so findall
# returns only the one remaining group
re.findall(r'(?:https?)://(\S+)', text)  # Returns just the part after '://'

Lookahead and Lookbehind

import re
 
# Positive lookahead: match if followed by
re.findall(r'\w+(?=@)', 'user@example.com')  # ['user']

# Negative lookahead: match if NOT followed by.
# The extra \d alternative stops \d+ from backtracking to a shorter run:
# plain r'\d+(?!px)' would give up the last digit of '100' and return '10'.
re.findall(r'\d+(?!\d|px)', '100px 200em 300')  # ['200', '300']

# Positive lookbehind: match if preceded by
re.findall(r'(?<=\$)\d+', 'Price: $100')  # ['100']

# Negative lookbehind: match if NOT preceded by
# ('00' appears because only the digit right after '$' is excluded)
re.findall(r'(?<!\$)\d+', '$100 200')  # ['00', '200']

Substitution

import re
 
# Basic replace
re.sub(r'\s+', ' ', 'too   many    spaces')
 
# With function
def double(match):
    """Return the integer text matched by ``match`` doubled, as a string."""
    matched_number = int(match.group())
    doubled = matched_number * 2
    return str(doubled)
 
re.sub(r'\d+', double, 'a1b2c3')  # 'a2b4c6'
 
# With groups
re.sub(r'(\w+)@(\w+)', r'\2:\1', 'user@host')  # 'host:user'
 
# Limit replacements
re.sub(r'\d', 'X', '123456', count=3)  # 'XXX456'

Split

import re
 
# Split on pattern
re.split(r'[,;\s]+', 'a, b; c d')  # ['a', 'b', 'c', 'd']
 
# Keep delimiters
re.split(r'([,;])', 'a,b;c')  # ['a', ',', 'b', ';', 'c']
 
# Limit splits
re.split(r'\s', 'a b c d', maxsplit=2)  # ['a', 'b', 'c d']

finditer (Memory Efficient)

import re
 
# For large texts, use iterator
for match in re.finditer(r'\b\w+@\w+\.\w+\b', huge_text):
    print(match.group(), match.start(), match.end())

Match Object Methods

import re
 
text = "Hello, World!"
match = re.search(r'(\w+), (\w+)', text)
 
match.group()     # 'Hello, World'
match.group(0)    # Same as above
match.group(1)    # 'Hello'
match.group(2)    # 'World'
match.groups()    # ('Hello', 'World')
match.start()     # 0
match.end()       # 12
match.span()      # (0, 12)

Greedy vs Non-Greedy

import re
 
text = '<tag>content</tag>'
 
# Greedy (default)
re.findall(r'<.*>', text)   # ['<tag>content</tag>']
 
# Non-greedy
re.findall(r'<.*?>', text)  # ['<tag>', '</tag>']
 
# Also works with +, ?
re.findall(r'\d+?', '12345')  # ['1', '2', '3', '4', '5']

Atomic Groups (Python 3.11+)

import re
 
# Possessive quantifiers (*+, ++, ?+) never give back what they matched,
# so the engine cannot backtrack into them. a++ is shorthand for the
# atomic group (?>a+); both forms require Python 3.11+.
re.search(r'a++b', 'aaab')  # Faster, no backtrack

Practical Patterns

Email Validation

# Validates that an entire string looks like an email address.
# \Z is used instead of $ because $ also matches just before a trailing
# newline, which would let 'user@example.com\n' pass validation.
EMAIL = re.compile(r'''
    ^                       # Start
    [\w.+-]+                # Local part
    @                       # At symbol
    [a-zA-Z\d.-]+           # Domain
    \.                      # Dot
    [a-zA-Z]{2,}            # TLD
    \Z                      # End of string (no trailing newline allowed)
''', re.VERBOSE)

URL Extraction

URL = re.compile(r'''
    https?://               # Protocol
    (?:[\w-]+\.)+           # Subdomains
    [\w-]+                  # Domain
    (?:/[\w./-]*)?          # Path
    (?:\?[\w=&]*)?          # Query
''', re.VERBOSE)

Password Validation

def is_strong_password(password):
    """Return True if ``password`` satisfies every strength rule below."""
    requirements = (
        r'.{8,}',           # length: eight or more characters in a row
        r'[A-Z]',           # an uppercase ASCII letter
        r'[a-z]',           # a lowercase ASCII letter
        r'\d',              # a digit
        r'[!@#$%^&*]',      # one of the listed symbols
    )
    # Bail out on the first rule that finds nothing in the password.
    for rule in requirements:
        if re.search(rule, password) is None:
            return False
    return True

Log Parsing

# Parses access-log lines of the form:
#   <ip> [<timestamp>] "<METHOD> <path> HTTP/x.y" <status> <size>
# re.VERBOSE ignores the literal whitespace and newlines in the pattern,
# so separators are matched only by the explicit \s+ tokens.
LOG_PATTERN = re.compile(r'''
    (?P<ip>\d+\.\d+\.\d+\.\d+)\s+
    \[(?P<timestamp>[^\]]+)\]\s+
    "(?P<method>\w+)\s+(?P<path>\S+)\s+HTTP/\d\.\d"\s+
    (?P<status>\d+)\s+
    (?P<size>\d+)
''', re.VERBOSE)

# NOTE: finditer() scans a string, so log_file must hold the log text
# (e.g. the result of f.read()), not an open file object.
for match in LOG_PATTERN.finditer(log_file):
    print(match.groupdict())

HTML Tag Extraction

# Simple tag matching (use proper parser for complex HTML)
TAGS = re.compile(r'<(\w+)[^>]*>(.*?)</\1>', re.DOTALL)
 
for tag, content in TAGS.findall(html):
    print(f"{tag}: {content[:50]}")

Performance Tips

import re
 
# Compile patterns used multiple times
pattern = re.compile(r'\d+')
 
# Use raw strings to avoid escape issues
r'\n\t\d+'  # Correct
'\\n\\t\\d+'  # Works but messy
 
# Anchor patterns when possible
re.match(r'start', text)  # Faster than search with ^
 
# Avoid catastrophic backtracking
# Bad: r'(a+)+b'
# Good: r'a+b'

Escape Special Characters

import re
 
# Escape user input
user_input = "cost: $100"
safe = re.escape(user_input)
# 'cost:\\ \\$100'
 
re.search(safe, text)  # Safe literal match

Summary

Advanced re patterns:

  • Flags: re.I, re.M, re.S, re.X
  • Groups: Named (?P<name>), non-capturing (?:)
  • Lookaround: (?=), (?!), (?<=), (?<!)
  • Greedy/lazy: * vs *?, + vs +?
  • Substitution: re.sub() with functions
  • Performance: Compile patterns, anchor when possible

Master regex for text processing and validation.

React to this post: