Python's urllib package provides tools for working with URLs and making HTTP requests without external dependencies. It is split into four modules: urllib.request, urllib.parse, urllib.error, and urllib.robotparser.

urllib.request - Making Requests

Basic GET

from urllib.request import urlopen

# urlopen returns a file-like response object; the body arrives as bytes,
# so decode it before treating it as text.
resp = urlopen('https://httpbin.org/get')
print(resp.read().decode('utf-8'))

With Headers

from urllib.request import Request, urlopen

# Build a Request object first so custom headers can be attached
# before the connection is opened.
request = Request(
    'https://httpbin.org/get',
    headers={'User-Agent': 'MyApp/1.0'}
)

reply = urlopen(request)
print(reply.read().decode())

POST Request

from urllib.request import Request, urlopen
import json

# Request bodies must be bytes: serialize to JSON, then encode.
payload = json.dumps({'key': 'value'}).encode('utf-8')

# Supplying data would already imply POST; method='POST' makes it explicit.
request = Request(
    'https://httpbin.org/post',
    data=payload,
    headers={'Content-Type': 'application/json'},
    method='POST'
)

reply = urlopen(request)
print(reply.read().decode())

Form Data

from urllib.request import Request, urlopen
from urllib.parse import urlencode

# urlencode produces application/x-www-form-urlencoded key=value pairs.
encoded = urlencode({'username': 'alice', 'password': 'secret'})

request = Request(
    'https://httpbin.org/post',
    data=encoded.encode('utf-8'),
    method='POST'
)

response = urlopen(request)

urllib.parse - URL Manipulation

Parse URLs

from urllib.parse import urlparse

# A URL exercising every component: credentials, port, path, query, fragment.
url = 'https://user:pass@example.com:8080/path?query=1#section'
parsed = urlparse(url)

# Each component is exposed as an attribute on the parse result;
# printed in the same order as the manual calls would produce.
for attribute in ('scheme', 'netloc', 'hostname', 'port', 'path',
                  'query', 'fragment', 'username', 'password'):
    print(getattr(parsed, attribute))

Build URLs

from urllib.parse import urlunparse, urljoin

# urlunparse assembles a URL from its six components:
# (scheme, netloc, path, params, query, fragment) — params left empty here.
components = ('https', 'example.com', '/path', '', 'query=1', 'section')
url = urlunparse(components)
print(url)  # https://example.com/path?query=1#section

# urljoin resolves a reference against a base, the way a browser would:
# relative paths append, absolute paths replace.
base = 'https://example.com/api/v1/'
print(urljoin(base, 'users'))     # https://example.com/api/v1/users
print(urljoin(base, '/absolute')) # https://example.com/absolute

Query Strings

from urllib.parse import urlencode, parse_qs, parse_qsl

# doseq=True expands list values into repeated keys instead of
# percent-encoding the Python list repr.
query = urlencode({'name': 'Alice', 'tags': ['a', 'b']}, doseq=True)
print(query)  # name=Alice&tags=a&tags=b

# parse_qs groups repeated keys into lists under one dict key.
print(parse_qs('name=Alice&tags=a&tags=b'))  # {'name': ['Alice'], 'tags': ['a', 'b']}

# parse_qsl keeps the original order as (key, value) pairs.
parsed = parse_qsl('name=Alice&age=30')
print(parsed)  # [('name', 'Alice'), ('age', '30')]

URL Encoding

from urllib.parse import quote, quote_plus, unquote

# quote percent-escapes reserved characters; quote_plus additionally
# maps spaces to '+' (the application/x-www-form-urlencoded convention).
for escaped in (quote('hello world'), quote_plus('hello world')):
    print(escaped)

# unquote reverses the percent-encoding.
print(unquote('hello%20world'))  # hello world

urllib.error - Exceptions

from urllib.request import urlopen
from urllib.error import URLError, HTTPError

# HTTPError subclasses URLError, so it must be caught first or the
# generic handler would shadow it.
try:
    response = urlopen('https://httpbin.org/status/404')
except HTTPError as e:
    # Server answered, but with an error status (4xx/5xx).
    print(f"HTTP {e.code}: {e.reason}")
except URLError as e:
    # Request never completed: DNS failure, refused connection, etc.
    print(f"URL Error: {e.reason}")

urllib.robotparser - robots.txt

from urllib.robotparser import RobotFileParser

# Download and parse the site's robots.txt once up front.
robots = RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()

# can_fetch(user_agent, path) — '*' matches any crawler.
print(robots.can_fetch('*', '/'))
print(robots.can_fetch('*', '/private'))

Response Handling

from urllib.request import urlopen

response = urlopen('https://httpbin.org/get')

# Response metadata
print(response.status)        # 200
print(response.reason)        # OK
print(response.headers)       # Headers object
print(response.getheader('Content-Type'))

# Read the whole body. NOTE: this consumes the underlying stream —
# an HTTP response can only be read once.
body = response.read()        # bytes
text = body.decode('utf-8')   # string

# Stream large responses line by line. Because the first response was
# already drained by read() above, open a fresh one for streaming.
with urlopen('https://httpbin.org/get') as streamed:
    for line in streamed:
        print(line.decode())

Timeouts

from urllib.request import urlopen
from urllib.error import URLError
import socket

# The endpoint delays 5s but the timeout is 2s, so this always times out.
try:
    response = urlopen('https://httpbin.org/delay/5', timeout=2)
except socket.timeout:
    # Raised directly when the socket read exceeds the timeout.
    print("Request timed out")
except URLError as exc:
    # Some timeout paths surface wrapped in URLError via .reason instead.
    if isinstance(exc.reason, socket.timeout):
        print("Request timed out")

Downloading Files

from urllib.request import urlretrieve

# Simple download
urlretrieve('https://example.com/file.zip', 'local_file.zip')

# With progress. urlretrieve calls the hook as
# progress(block_number, block_size, total_size); total_size is -1
# when the server sends no Content-Length header, so guard against
# dividing by a non-positive size.
def progress(count, block_size, total_size):
    """Report download progress on a single, repeatedly rewritten line."""
    if total_size > 0:
        # Cap at 100: the final block can overshoot the exact total.
        percent = min(count * block_size * 100 / total_size, 100.0)
        print(f"\rDownloading: {percent:.1f}%", end='')
    else:
        # Unknown total size — fall back to a byte count.
        print(f"\rDownloading: {count * block_size} bytes", end='')

urlretrieve('https://example.com/file.zip', 'local_file.zip', progress)
print()

Custom Handlers

from urllib.request import build_opener, HTTPHandler

# debuglevel=1 makes the handler dump the raw request/response
# exchange to stdout — useful for troubleshooting.
debug_handler = HTTPHandler(debuglevel=1)
opener = build_opener(debug_handler)

response = opener.open('https://httpbin.org/get')

Basic Authentication

from urllib.request import HTTPPasswordMgrWithDefaultRealm
from urllib.request import HTTPBasicAuthHandler, build_opener

# Register credentials; a realm of None means "use these for any realm"
# at the given URL prefix.
credentials = HTTPPasswordMgrWithDefaultRealm()
credentials.add_password(None, 'https://httpbin.org/', 'user', 'pass')

# The opener retries with Basic auth after a 401 challenge.
opener = build_opener(HTTPBasicAuthHandler(credentials))
response = opener.open('https://httpbin.org/basic-auth/user/pass')

Proxy Support

from urllib.request import ProxyHandler, build_opener

# Route both plain HTTP and HTTPS traffic through the same proxy.
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080'
}

opener = build_opener(ProxyHandler(proxies))
response = opener.open('https://httpbin.org/get')

Context Manager

from urllib.request import urlopen

# The response object is a context manager: the connection is closed
# automatically when the with-block exits.
with urlopen('https://httpbin.org/get') as response:
    data = response.read().decode()
    print(data)

When to Use urllib vs requests

urllib (stdlib):

  • No external dependencies
  • Simple GET/POST
  • URL parsing and manipulation
  • Part of standard library

requests (external):

  • Cleaner API
  • Session handling
  • Better JSON support
  • More intuitive

Summary

urllib handles URLs and HTTP in the standard library:

  • urllib.request - make HTTP requests
  • urllib.parse - parse and build URLs
  • urllib.error - handle exceptions
  • urllib.robotparser - parse robots.txt

For simple scripts with no external deps, urllib works. For complex HTTP work, consider requests.

React to this post: