Python's urllib package provides tools for working with URLs and making HTTP requests without external dependencies. It's split into four modules.
urllib.request - Making Requests
Basic GET
from urllib.request import urlopen
response = urlopen('https://httpbin.org/get')
html = response.read().decode('utf-8')
print(html)

With Headers
from urllib.request import Request, urlopen
req = Request(
'https://httpbin.org/get',
headers={'User-Agent': 'MyApp/1.0'}
)
response = urlopen(req)
print(response.read().decode())

POST Request
from urllib.request import Request, urlopen
import json
data = json.dumps({'key': 'value'}).encode('utf-8')
req = Request(
'https://httpbin.org/post',
data=data,
headers={'Content-Type': 'application/json'},
method='POST'
)
response = urlopen(req)
print(response.read().decode())

Form Data
from urllib.request import Request, urlopen
from urllib.parse import urlencode
form_data = urlencode({'username': 'alice', 'password': 'secret'})
req = Request(
'https://httpbin.org/post',
data=form_data.encode('utf-8'),
method='POST'
)
response = urlopen(req)

urllib.parse - URL Manipulation
Parse URLs
from urllib.parse import urlparse
url = 'https://user:pass@example.com:8080/path?query=1#section'
parsed = urlparse(url)
print(parsed.scheme) # https
print(parsed.netloc) # user:pass@example.com:8080
print(parsed.hostname) # example.com
print(parsed.port) # 8080
print(parsed.path) # /path
print(parsed.query) # query=1
print(parsed.fragment) # section
print(parsed.username) # user
print(parsed.password) # pass

Build URLs
from urllib.parse import urlunparse, urljoin
# Build from parts
url = urlunparse((
'https', # scheme
'example.com', # netloc
'/path', # path
'', # params
'query=1', # query
'section' # fragment
))
print(url) # https://example.com/path?query=1#section
# Join URLs
base = 'https://example.com/api/v1/'
print(urljoin(base, 'users')) # https://example.com/api/v1/users
print(urljoin(base, '/absolute')) # https://example.com/absolute

Query Strings
from urllib.parse import urlencode, parse_qs, parse_qsl
# Build query string
params = {'name': 'Alice', 'tags': ['a', 'b']}
query = urlencode(params, doseq=True)
print(query) # name=Alice&tags=a&tags=b
# Parse query string
parsed = parse_qs('name=Alice&tags=a&tags=b')
print(parsed) # {'name': ['Alice'], 'tags': ['a', 'b']}
# Parse as list of tuples
parsed = parse_qsl('name=Alice&age=30')
print(parsed) # [('name', 'Alice'), ('age', '30')]

URL Encoding
from urllib.parse import quote, quote_plus, unquote
# Encode special characters
print(quote('hello world')) # hello%20world
print(quote_plus('hello world')) # hello+world (for form data)
# Decode
print(unquote('hello%20world')) # hello world

urllib.error - Exceptions
from urllib.request import urlopen
from urllib.error import URLError, HTTPError
try:
response = urlopen('https://httpbin.org/status/404')
except HTTPError as e:
print(f"HTTP {e.code}: {e.reason}")
except URLError as e:
print(f"URL Error: {e.reason}")

urllib.robotparser - robots.txt
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()
# Check if URL is allowed
print(rp.can_fetch('*', '/'))
print(rp.can_fetch('*', '/private'))

Response Handling
from urllib.request import urlopen
response = urlopen('https://httpbin.org/get')
# Response info
print(response.status) # 200
print(response.reason) # OK
print(response.headers) # Headers object
print(response.getheader('Content-Type'))
# Read body
body = response.read() # bytes
text = body.decode('utf-8') # string
# Stream large responses
for line in response:
print(line.decode())

Timeouts
from urllib.request import urlopen
from urllib.error import URLError
import socket
try:
response = urlopen('https://httpbin.org/delay/5', timeout=2)
except socket.timeout:
print("Request timed out")
except URLError as e:
if isinstance(e.reason, socket.timeout):
print("Request timed out")

Downloading Files
from urllib.request import urlretrieve
# Simple download
urlretrieve('https://example.com/file.zip', 'local_file.zip')
# With progress
def progress(count, block_size, total_size):
percent = count * block_size * 100 / total_size
print(f"\rDownloading: {percent:.1f}%", end='')
urlretrieve('https://example.com/file.zip', 'local_file.zip', progress)
print()

Custom Handlers
from urllib.request import build_opener, HTTPHandler
# Debug handler
handler = HTTPHandler(debuglevel=1)
opener = build_opener(handler)
response = opener.open('https://httpbin.org/get')

Basic Authentication
from urllib.request import HTTPPasswordMgrWithDefaultRealm
from urllib.request import HTTPBasicAuthHandler, build_opener
password_mgr = HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'https://httpbin.org/', 'user', 'pass')
auth_handler = HTTPBasicAuthHandler(password_mgr)
opener = build_opener(auth_handler)
response = opener.open('https://httpbin.org/basic-auth/user/pass')

Proxy Support
from urllib.request import ProxyHandler, build_opener
proxy = ProxyHandler({
'http': 'http://proxy.example.com:8080',
'https': 'http://proxy.example.com:8080'
})
opener = build_opener(proxy)
response = opener.open('https://httpbin.org/get')

Context Manager
from urllib.request import urlopen
with urlopen('https://httpbin.org/get') as response:
data = response.read().decode()
print(data)
# Automatically closes

When to Use urllib vs requests
urllib (stdlib):
- No external dependencies
- Simple GET/POST
- URL parsing and manipulation
- Part of standard library
requests (external):
- Cleaner API
- Session handling
- Better JSON support
- More intuitive
Summary
urllib handles URLs and HTTP in the standard library:
- urllib.request - make HTTP requests
- urllib.parse - parse and build URLs
- urllib.error - handle exceptions
- urllib.robotparser - parse robots.txt
For simple scripts with no external deps, urllib works. For complex HTTP work, consider requests.
React to this post: