Python XML Parsing

Python's xml package provides several modules for XML parsing. The most common are xml.etree.ElementTree for general use, xml.dom.minidom for DOM manipulation, and xml.sax for event-based streaming of large documents.

ElementTree - The Standard Choice

Parsing XML

import xml.etree.ElementTree as ET
 
# Sample catalog document used by the ElementTree examples.
xml_string = """
<catalog>
    <book id="1">
        <title>Python Guide</title>
        <author>Alice</author>
        <price>29.99</price>
    </book>
    <book id="2">
        <title>Data Science</title>
        <author>Bob</author>
        <price>39.99</price>
    </book>
</catalog>
"""

# fromstring() parses text and hands back the top-level element directly.
root = ET.fromstring(xml_string)

# The tag name of the top-level element
print(root.tag)  # catalog

# Walk the <book> children: attributes come from get(), child text from find().text
for book in root.iterfind('book'):
    book_id = book.get('id')
    title = book.find('title').text
    author = book.find('author').text
    print(f"{book_id}: {title} by {author}")

From File

import xml.etree.ElementTree as ET
 
# parse() reads the file and returns an ElementTree wrapper;
# getroot() gives the top-level element inside it.
tree = ET.parse('data.xml')
root = tree.getroot()

# Iterating an element yields its direct children.
for child in root:
    print(child.tag, child.attrib)

Finding Elements

import xml.etree.ElementTree as ET
 
root = ET.fromstring(xml_string)

# Iterating an element walks its direct children only
for child in root:
    print(child.tag)

# find() returns the first match
book = root.find('book')

# findall() returns every match as a list
books = root.findall('book')

# A [@name='value'] predicate selects by attribute
book = root.find(".//book[@id='1']")

# A leading .// searches every level beneath the element
titles = root.findall('.//title')

# iter() with no argument visits the element and all of its descendants
for node in root.iter():
    print(node.tag)

# iter(tag) restricts the walk to elements with that tag
for title_elem in root.iter('title'):
    print(title_elem.text)

XPath Support

import xml.etree.ElementTree as ET
 
root = ET.fromstring(xml_string)

# Each findall() call below returns a list of matching elements.

# Any depth: every <title> beneath the root
root.findall('.//title')

# Direct path: <title> children of the root's <book> children
root.findall('./book/title')

# By position (1 is the first position)
root.findall('.//book[1]')

# By attribute value
root.findall(".//book[@id='1']")

# By the text of a child element
root.findall(".//book[price='29.99']")

Creating XML

import xml.etree.ElementTree as ET
 
# Build the tree top-down; SubElement() creates a child and attaches it to its parent
root = ET.Element('catalog')

# Attributes can be passed as keyword arguments when the element is created
book = ET.SubElement(root, 'book', id='1')

title = ET.SubElement(book, 'title')
author = ET.SubElement(book, 'author')
title.text = 'Python Guide'
author.text = 'Alice'

# encoding='unicode' makes tostring() return a str
xml_str = ET.tostring(root, encoding='unicode')
print(xml_str)

# Write an indented file that starts with an XML declaration
tree = ET.ElementTree(root)
ET.indent(tree, space="  ")  # Python 3.9+
tree.write('output.xml', encoding='utf-8', xml_declaration=True)

Modifying XML

import xml.etree.ElementTree as ET
 
root = ET.fromstring(xml_string)

# Modify text: raise every price by 10%, written back with two decimals
for price in root.iter('price'):
    new_price = float(price.text) * 1.1
    price.text = f"{new_price:.2f}"

# Modify attributes
for book in root.findall('book'):
    book.set('updated', 'true')

# Add element
new_book = ET.SubElement(root, 'book')
new_book.set('id', '3')
ET.SubElement(new_book, 'title').text = 'New Book'

# Remove element
# remove() only accepts direct children of the element it is called on, so
# search the direct children ("book[...]") rather than every descendant
# (".//book[...]"); a nested match would make root.remove() raise ValueError.
for book in root.findall("book[@id='1']"):
    root.remove(book)

minidom - DOM API

from xml.dom.minidom import parseString, parse
 
xml_string = "<root><item>Hello</item></root>"

# Build a DOM document from an in-memory string...
doc = parseString(xml_string)

# ...or from a file (this rebinds doc to the file's document)
doc = parse('data.xml')

# The top-level element, and the document's <item> elements
root = doc.documentElement
items = doc.getElementsByTagName('item')

# For <item>Hello</item> the first child is the text node holding "Hello"
for node in items:
    print(node.firstChild.nodeValue)

# Serialize the document with two-space indentation
pretty = doc.toprettyxml(indent="  ")
print(pretty)

SAX - Event-Based Parsing

For large files that don't fit in memory, SAX reports the document as a stream of events instead of building a tree:

import xml.sax
 
class BookHandler(xml.sax.ContentHandler):
    """SAX handler that prints each book's id and title as the document streams by.

    Attributes:
        current: Tag of the most recently opened element, or "" once any
            element has closed.
        title: Text accumulated so far for the <title> being read.
    """

    def __init__(self):
        # Let ContentHandler initialise its own state first.
        super().__init__()
        self.current = ""
        self.title = ""

    def startElement(self, tag, attrs):
        """Record the opened tag; announce a book or start a fresh title."""
        self.current = tag
        if tag == "book":
            print(f"Book ID: {attrs['id']}")
        elif tag == "title":
            # Start from empty so an empty <title/> cannot reprint the
            # previous book's title.
            self.title = ""

    def characters(self, content):
        """Append a chunk of character data to the title being read."""
        # The parser may deliver one text node in several chunks, so the
        # chunks are appended rather than assigned.
        if self.current == "title":
            self.title += content

    def endElement(self, tag):
        """Print the completed title and clear the current-tag marker."""
        if tag == "title":
            print(f"Title: {self.title}")
        self.current = ""
 
# Attach a handler instance to a fresh parser, then stream the file through it
handler = BookHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse('books.xml')

Namespaces

import xml.etree.ElementTree as ET
 
xml_string = """
<root xmlns:h="http://www.w3.org/HTML" xmlns:f="http://www.w3.org/Form">
    <h:table>
        <h:tr><h:td>Data</h:td></h:tr>
    </h:table>
    <f:form><f:input/></f:form>
</root>
"""

root = ET.fromstring(xml_string)

# Map the prefix used in the search path to its namespace URI
ns = {'h': 'http://www.w3.org/HTML'}
tables = root.findall('.//h:table', ns)

# Namespaced tags look like "{uri}name"; print only the part after the brace
for node in root.iter():
    if '}' in node.tag:
        tag = node.tag.split('}')[1]
    else:
        tag = node.tag
    print(tag)

CDATA Handling

import xml.etree.ElementTree as ET
 
# The content of a CDATA section is returned as ordinary element text
xml_string = "<root><![CDATA[Some <special> text]]></root>"
root = ET.fromstring(xml_string)
text = root.text
print(text)  # Some <special> text

Error Handling

import xml.etree.ElementTree as ET
 
# Malformed input makes fromstring() raise ET.ParseError
try:
    root = ET.fromstring("<invalid><xml>")
except ET.ParseError as err:
    message = f"Parse error: {err}"
    print(message)

Converting to Dict

import xml.etree.ElementTree as ET
 
def xml_to_dict(elem):
    """Convert an element's children into a nested dict keyed by tag name.

    A child without sub-elements maps to its text (``None`` when it has
    none); a child with sub-elements maps to a nested dict built the same
    way. When several siblings share a tag, their values are gathered into
    a list in document order instead of later ones overwriting earlier ones.
    Attributes, and the text of ``elem`` itself, are not included.

    Args:
        elem: The ``xml.etree.ElementTree.Element`` whose children to convert.

    Returns:
        A dict mapping each child tag to a str/None, a nested dict, or a
        list of those for a repeated tag.
    """
    result = {}
    repeated = set()  # tags whose entry has already been turned into a list
    for child in elem:
        value = xml_to_dict(child) if len(child) else child.text
        tag = child.tag
        if tag not in result:
            result[tag] = value
        elif tag in repeated:
            result[tag].append(value)
        else:
            # Second occurrence: promote the single value to a list.
            result[tag] = [result[tag], value]
            repeated.add(tag)
    return result
 
# xml_string is presumably the catalog document from the "Parsing XML"
# example (the expected output below shows its first book)
root = ET.fromstring(xml_string)
first_book = root.find('book')
data = xml_to_dict(first_book)
print(data)  # {'title': 'Python Guide', 'author': 'Alice', ...}

Security Warning

# UNSAFE - the standard-library parser is not hardened against maliciously
# constructed XML (e.g. entity-expansion attacks such as "billion laughs")
import xml.etree.ElementTree as ET
tree = ET.parse(untrusted_file)  # parse() returns an ElementTree, not the root element
root = tree.getroot()

# SAFE - use defusedxml for untrusted input
# pip install defusedxml
import defusedxml.ElementTree as ET
tree = ET.parse(untrusted_file)
root = tree.getroot()

Summary

Python's XML tools:

ElementTree: Simple, Pythonic API (recommended)
minidom: DOM API, good for pretty printing
SAX: Event-based, memory efficient for large files

For most use cases, xml.etree.ElementTree is the right choice. Use defusedxml when parsing untrusted input.

React to this post:

#ElementTree - The Standard Choice

#Parsing XML

#From File

#Finding Elements

#XPath Support

#Creating XML

#Modifying XML

#minidom - DOM API

#SAX - Event-Based Parsing

#Namespaces

#CDATA Handling

#Error Handling

#Converting to Dict

#Security Warning

#Summary

Keep Reading

Need help shipping fast?