Python's xml package provides several modules for XML parsing. The most common are xml.etree.ElementTree for general use and xml.dom.minidom for DOM manipulation.
ElementTree - The Standard Choice
Parsing XML
import xml.etree.ElementTree as ET
xml_string = """
<catalog>
<book id="1">
<title>Python Guide</title>
<author>Alice</author>
<price>29.99</price>
</book>
<book id="2">
<title>Data Science</title>
<author>Bob</author>
<price>39.99</price>
</book>
</catalog>
"""
root = ET.fromstring(xml_string)
# Root element
print(root.tag) # catalog
# Find elements
for book in root.findall('book'):
title = book.find('title').text
author = book.find('author').text
book_id = book.get('id')
print(f"{book_id}: {title} by {author}")
From File
import xml.etree.ElementTree as ET
tree = ET.parse('data.xml')
root = tree.getroot()
for elem in root:
print(elem.tag, elem.attrib)
Finding Elements
import xml.etree.ElementTree as ET
root = ET.fromstring(xml_string)
# Direct children
for child in root:
print(child.tag)
# Find first match
book = root.find('book')
# Find all matches
books = root.findall('book')
# Find by attribute
book = root.find(".//book[@id='1']")
# Find nested elements
titles = root.findall('.//title')
# Iterate all descendants
for elem in root.iter():
print(elem.tag)
# Iterate specific tag
for title in root.iter('title'):
print(title.text)
XPath Support
import xml.etree.ElementTree as ET
root = ET.fromstring(xml_string)
# Find by path
root.findall('./book/title') # Direct path
root.findall('.//title') # Any depth
root.findall(".//book[@id='1']") # By attribute
root.findall('.//book[1]') # By position
root.findall(".//book[price='29.99']") # By child text
Creating XML
import xml.etree.ElementTree as ET
# Create elements
root = ET.Element('catalog')
book = ET.SubElement(root, 'book')
book.set('id', '1')
title = ET.SubElement(book, 'title')
title.text = 'Python Guide'
author = ET.SubElement(book, 'author')
author.text = 'Alice'
# Convert to string
xml_str = ET.tostring(root, encoding='unicode')
print(xml_str)
# With declaration and formatting
tree = ET.ElementTree(root)
ET.indent(tree, space=" ") # Python 3.9+
tree.write('output.xml', encoding='utf-8', xml_declaration=True)
Modifying XML
import xml.etree.ElementTree as ET
root = ET.fromstring(xml_string)
# Modify text
for price in root.iter('price'):
new_price = float(price.text) * 1.1 # 10% increase
price.text = f"{new_price:.2f}"
# Modify attributes
for book in root.findall('book'):
book.set('updated', 'true')
# Add element
new_book = ET.SubElement(root, 'book')
new_book.set('id', '3')
ET.SubElement(new_book, 'title').text = 'New Book'
# Remove element
for book in root.findall(".//book[@id='1']"):
root.remove(book)
minidom - DOM API
from xml.dom.minidom import parseString, parse
xml_string = "<root><item>Hello</item></root>"
# Parse string
doc = parseString(xml_string)
# Parse file
doc = parse('data.xml')
# Access elements
root = doc.documentElement
items = doc.getElementsByTagName('item')
for item in items:
print(item.firstChild.nodeValue)
# Pretty print
print(doc.toprettyxml(indent=" "))
SAX - Event-Based Parsing
For large files that don't fit in memory:
import xml.sax
class BookHandler(xml.sax.ContentHandler):
    """SAX content handler that prints each book's ID and full title.

    For every ``<book>`` start tag the ``id`` attribute is printed, and for
    every ``<title>`` element the complete text is printed once the element
    closes.

    Attributes:
        current: Tag name of the element most recently opened, or ``""``
            after an element has closed.
        title: Text collected so far for the ``<title>`` being read; after
            the element closes, the complete title text.
    """

    def __init__(self):
        super().__init__()
        self.current = ""
        self.title = ""

    def startElement(self, tag, attrs):
        """Record the open tag, print a book's ID, and reset title text."""
        self.current = tag
        if tag == "book":
            print(f"Book ID: {attrs['id']}")
        elif tag == "title":
            # Start fresh so an empty <title/> does not reuse the text of
            # the previous book's title.
            self.title = ""

    def characters(self, content):
        """Accumulate text that belongs to the current ``<title>``."""
        if self.current == "title":
            # A parser may deliver one text node in several chunks (for
            # example around entity references such as &amp;), so append
            # instead of overwriting.
            self.title += content

    def endElement(self, tag):
        """Print the collected title when a ``<title>`` element closes."""
        if tag == "title":
            print(f"Title: {self.title}")
        self.current = ""
parser = xml.sax.make_parser()
parser.setContentHandler(BookHandler())
parser.parse('books.xml')
Namespaces
import xml.etree.ElementTree as ET
xml_string = """
<root xmlns:h="http://www.w3.org/HTML" xmlns:f="http://www.w3.org/Form">
<h:table>
<h:tr><h:td>Data</h:td></h:tr>
</h:table>
<f:form><f:input/></f:form>
</root>
"""
root = ET.fromstring(xml_string)
# Find with namespace
ns = {'h': 'http://www.w3.org/HTML'}
tables = root.findall('.//h:table', ns)
# Iterate ignoring namespace
for elem in root.iter():
# Remove namespace from tag
tag = elem.tag.split('}')[1] if '}' in elem.tag else elem.tag
print(tag)
CDATA Handling
import xml.etree.ElementTree as ET
# CDATA becomes normal text
xml_string = "<root><![CDATA[Some <special> text]]></root>"
root = ET.fromstring(xml_string)
print(root.text) # Some <special> text
Error Handling
import xml.etree.ElementTree as ET
try:
root = ET.fromstring("<invalid><xml>")
except ET.ParseError as e:
print(f"Parse error: {e}")
Converting to Dict
import xml.etree.ElementTree as ET
def xml_to_dict(elem):
    """Convert the children of *elem* into a nested dictionary.

    A child without sub-elements maps its tag to its text (``None`` when the
    element is empty). A child with sub-elements is converted recursively.
    When the same tag occurs more than once among siblings, the values are
    gathered into a list in document order rather than the last sibling
    overwriting the earlier ones.

    Attributes and the text of *elem* itself are not included.

    Args:
        elem: The element whose children are converted.

    Returns:
        A dict keyed by child tag; an empty dict if *elem* has no children.
    """
    result = {}
    for child in elem:
        value = xml_to_dict(child) if len(child) else child.text
        if child.tag not in result:
            result[child.tag] = value
        elif isinstance(result[child.tag], list):
            # Third or later occurrence: the list already exists.
            result[child.tag].append(value)
        else:
            # Second occurrence: promote the single value to a list.
            result[child.tag] = [result[child.tag], value]
    return result
root = ET.fromstring(xml_string)
data = xml_to_dict(root.find('book'))
print(data) # {'title': 'Python Guide', 'author': 'Alice', ...}
Security Warning
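# UNSAFE for untrusted input - the standard-library parser is not hardened against malicious XML such as entity-expansion (billion laughs / quadratic blowup) attacks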
import xml.etree.ElementTree as ET
root = ET.parse(untrusted_file)
# SAFE - use defusedxml for untrusted input
# pip install defusedxml
import defusedxml.ElementTree as ET
root = ET.parse(untrusted_file)
Summary
Python's XML tools:
- ElementTree: Simple, Pythonic API (recommended)
- minidom: DOM API, good for pretty printing
- SAX: Event-based, memory efficient for large files
For most use cases, xml.etree.ElementTree is the right choice. Use defusedxml when parsing untrusted input.
React to this post: