Python Regular Expressions

Regex is powerful but cryptic. Here's how to use it effectively in Python.

Basic Usage

import re
 
text = "Contact us at hello@example.com"

# Scan the whole string and stop at the first email-shaped token.
email_regex = r"\w+@\w+\.\w+"
match = re.search(email_regex, text)
if match is not None:
    # group(0) is the entire matched substring.
    print(match.group(0))  # hello@example.com

Key Functions

# search - find first match anywhere
re.search(pattern, text)
 
# match - match at start only
re.match(pattern, text)
 
# findall - find all matches
re.findall(pattern, text)
 
# finditer - iterator of match objects
re.finditer(pattern, text)
 
# sub - replace matches
re.sub(pattern, replacement, text)
 
# split - split by pattern
re.split(pattern, text)

Common Patterns

# Digits
r"\d"       # Single digit
r"\d+"      # One or more digits
r"\d{3}"    # Exactly 3 digits
r"\d{2,4}"  # 2 to 4 digits
 
# Word characters
r"\w"       # Letter, digit, underscore
r"\w+"      # Word
 
# Whitespace
r"\s"       # Space, tab, newline
r"\s+"      # One or more whitespace
 
# Any character
r"."        # Any char except newline
r".*"       # Any number of any chars

Character Classes

r"[aeiou]"      # Any vowel
r"[a-z]"        # Lowercase letter
r"[A-Z]"        # Uppercase letter
r"[0-9]"        # Digit
r"[a-zA-Z0-9]"  # Alphanumeric
r"[^0-9]"       # NOT a digit

Anchors

r"^start"   # Start of string (start of every line with re.MULTILINE)
r"end$"     # End of string, or just before a trailing newline (end of every line with re.MULTILINE)
r"\bword\b" # Word boundary

Groups

text = "John Smith, 25 years old"
 
# Capturing groups
match = re.search(r"(\w+) (\w+), (\d+)", text)
if match:
    print(match.group(0))  # Full match
    print(match.group(1))  # John
    print(match.group(2))  # Smith
    print(match.group(3))  # 25
    print(match.groups())  # ('John', 'Smith', '25')

Named Groups

pattern = r"(?P<first>\w+) (?P<last>\w+), (?P<age>\d+)"
match = re.search(pattern, text)
if match:
    print(match.group("first"))  # John
    print(match.groupdict())     # {'first': 'John', ...}

Substitution

text = "Hello World"
 
# Simple replace
re.sub(r"World", "Python", text)  # Hello Python
 
# With groups
text = "John Smith"
re.sub(r"(\w+) (\w+)", r"\2, \1", text)  # Smith, John
 
# With function
def upper(match):
    """Return the full text of *match* converted to upper case.

    Intended as a replacement callback for ``re.sub``, which calls it
    once per match with the match object.
    """
    matched_text = match.group()
    return matched_text.upper()
 
re.sub(r"\w+", upper, "hello world")  # HELLO WORLD

Flags

# Case insensitive
re.search(r"hello", "HELLO", re.IGNORECASE)
re.search(r"hello", "HELLO", re.I)
 
# Multiline - ^ and $ match line boundaries
re.findall(r"^\w+", text, re.MULTILINE)
 
# Dotall - . matches newlines too
re.search(r"start.*end", text, re.DOTALL)
 
# Verbose - allow comments and whitespace
pattern = re.compile(r"""
    \d{3}   # Area code
    -       # Separator
    \d{4}   # Number
""", re.VERBOSE)

Compile for Reuse

# Compile once, use many times
email_pattern = re.compile(r"[\w.-]+@[\w.-]+\.\w+")
 
# findall scans the whole text and returns every email-shaped substring
emails = email_pattern.findall(text)
# Validate with fullmatch(): match() only anchors at the start, so it would
# accept input such as "a@b.com<script>" that merely begins with an email.
# The result is a match object or None, so it is truthy only when the
# entire input is email-shaped.
is_valid = email_pattern.fullmatch(user_input)

Common Patterns

Email

r"[\w.-]+@[\w.-]+\.\w+"

URL

r"https?://[\w./%-]+"

Phone Number

r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"

IP Address

# Shape check only: each octet is 1-3 digits, so out-of-range values
# such as 999.999.999.999 also match
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"

Date (YYYY-MM-DD)

r"\d{4}-\d{2}-\d{2}"

Greedy vs Non-Greedy

text = "<div>content</div>"
 
# Greedy (default) - matches as much as possible
re.search(r"<.*>", text).group()   # <div>content</div>
 
# Non-greedy - matches as little as possible
re.search(r"<.*?>", text).group()  # <div>

Add ? after a quantifier (*?, +?, ??, {m,n}?) to make it non-greedy.

Lookahead and Lookbehind

# Positive lookahead - followed by
r"\d+(?= dollars)"  # Matches "50" in "50 dollars"
 
# Negative lookahead - NOT followed by
# Caveat: the engine backtracks, so this still matches "5" in "50 dollars"
# ("5" is followed by "0", not " dollars"); r"\d+\b(?! dollars)" avoids that
r"\d+(?! dollars)"  # Matches digits not followed by " dollars"
 
# Positive lookbehind - preceded by
r"(?<=\$)\d+"       # Matches "50" in "$50"
 
# Negative lookbehind - NOT preceded by
# Caveat: this still matches "0" in "$50" (the "0" is preceded by "5");
# r"(?<![\$\d])\d+" rejects the whole number
r"(?<!\$)\d+"       # Matches digits not preceded by "$"

Best Practices

Use raw strings:

# Good
r"\d+\.\d+"
 
# Bad - need to escape backslashes
"\\d+\\.\\d+"

Compile patterns you reuse:

pattern = re.compile(r"\d+")
for text in texts:
    pattern.findall(text)

Test your patterns:

# Use regex101.com to test patterns
# Write unit tests for complex patterns

Don't overuse regex:

# For simple checks, string methods are clearer
if "@" in email:  # Better than regex for simple check
    ...  # placeholder body: an `if` with no body is a SyntaxError

When Not to Use Regex

Parsing HTML/XML (use BeautifulSoup)
Complex nested structures
When string methods suffice
When readability matters more than brevity

Regex is powerful but can be cryptic. Use it when it's the right tool.

React to this post:

#Basic Usage

#Key Functions

#Common Patterns

#Character Classes

#Anchors

#Groups

#Named Groups

#Substitution

#Flags

#Compile for Reuse

#Common Patterns

#Email

#URL

#Phone Number

#IP Address

#Date (YYYY-MM-DD)

#Greedy vs Non-Greedy

#Lookahead and Lookbehind

#Best Practices

#When Not to Use Regex

Keep Reading

Need help shipping fast?

Basic Usage

Key Functions

Common Patterns

Character Classes

Anchors

Groups

Named Groups

Substitution

Flags

Compile for Reuse

Common Patterns

Email

URL

Phone Number

IP Address

Date (YYYY-MM-DD)

Greedy vs Non-Greedy

Lookahead and Lookbehind

Best Practices

When Not to Use Regex