Why XML Is Still Relevant
Although JSON has replaced XML in most modern APIs, XML remains indispensable in key sectors:
- Office Open XML: `.docx`, `.xlsx`, `.pptx` are ZIP archives of XML files
- SVG: vector graphics in XML
- RSS/Atom: news feeds and podcasts
- SOAP: legacy enterprise web services (banks, governments, SAP)
- XHTML/HTML: XHTML is HTML serialized as XML, and the browser DOM uses the same tree model as XML documents
- Android: layouts, manifests and resources in XML
- Maven/Gradle: `pom.xml` for Java projects
- Configuration: Spring Boot, .NET config, Kubernetes manifests
ElementTree: The Standard Library
import xml.etree.ElementTree as ET
from pathlib import Path
# Sample <library> catalogue with two <book> records; reused by the parsing,
# XPath and XML-to-JSON examples in this file.
xml_str = '''<?xml version="1.0" encoding="UTF-8"?>
<library>
<book isbn="978-0-13-110362-7">
<title>The C Programming Language</title>
<authors>
<author>Brian W. Kernighan</author>
<author>Dennis M. Ritchie</author>
</authors>
<year>1988</year>
<price currency="USD">45.99</price>
<available>true</available>
</book>
<book isbn="978-0-201-63361-0">
<title>The Pragmatic Programmer</title>
<authors>
<author>Andrew Hunt</author>
<author>David Thomas</author>
</authors>
<year>1999</year>
<price currency="USD">39.99</price>
<available>false</available>
</book>
</library>'''
def parse_books(xml_content):
    """Extract book records from a ``<library>`` XML document.

    Args:
        xml_content: XML text whose root element has ``<book>`` children.

    Returns:
        A list with one dict per ``<book>``, holding the keys ``isbn``,
        ``title``, ``year`` (int), ``price`` (float), ``currency``,
        ``authors`` (list of author texts) and ``available`` (bool, True only
        when ``<available>`` is exactly ``true``). ``year``, ``price`` and
        ``currency`` are ``None`` when the element/attribute is missing or
        empty, so a sparse record no longer aborts the whole parse.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_content`` is not
            well-formed XML.
        ValueError: if ``<year>`` or ``<price>`` contains non-numeric text.
    """
    root = ET.fromstring(xml_content)
    books = []
    for book in root.findall('book'):
        year_text = book.findtext('year')

        # Look <price> up once: it supplies both the amount and the currency,
        # and it may legitimately be absent.
        price_elem = book.find('price')
        if price_elem is None:
            price_text, currency = None, None
        else:
            price_text, currency = price_elem.text, price_elem.get('currency')

        books.append({
            'isbn': book.get('isbn'),
            'title': book.findtext('title'),
            'year': int(year_text) if year_text else None,
            'price': float(price_text) if price_text else None,
            'currency': currency,
            'authors': [a.text for a in book.findall('authors/author')],
            'available': book.findtext('available') == 'true',
        })
    return books
# Demo: parse the sample catalogue and print one summary line per book.
books = parse_books(xml_str)
for book in books:
    print(
        f"{book['title']} ({book['year']}) — ${book['price']}"
    )
Creating XML with ElementTree
def create_xml_employees(employees, output_file):
    """Write a ``<company>`` XML document describing ``employees``.

    Args:
        employees: Sized collection of dicts, e.g.
            ``{'id': ..., 'name': ..., 'role': ..., 'email': ...,
            'salary': ..., 'skills': [...]}``. Every key is optional:
            ``id`` defaults to an empty string, ``salary`` to 0, and
            ``name``/``role``/``email``/``skills`` are omitted when absent.
        output_file: Destination handed to ``ElementTree.write`` (a file
            name or a text-mode file object).

    Returns:
        None. The document is written to ``output_file`` and a confirmation
        line is printed.
    """
    company = ET.Element('company')
    company.set('version', '1.0')
    staff_elem = ET.SubElement(company, 'employees')
    staff_elem.set('total', str(len(employees)))

    for emp in employees:
        emp_elem = ET.SubElement(staff_elem, 'employee')
        emp_elem.set('id', str(emp.get('id', '')))

        for field in ('name', 'role', 'email'):
            if field in emp:
                child = ET.SubElement(emp_elem, field)
                child.text = str(emp[field])

        salary_elem = ET.SubElement(emp_elem, 'salary')
        salary_elem.set('currency', 'USD')
        salary_elem.text = str(emp.get('salary', 0))

        if emp.get('skills'):
            skills_elem = ET.SubElement(emp_elem, 'skills')
            for skill in emp['skills']:
                s = ET.SubElement(skills_elem, 'skill')
                # Stringify like every other field: ElementTree refuses to
                # serialize non-str text, so a numeric skill used to abort
                # the write with TypeError.
                s.text = str(skill)

    ET.indent(company, space=' ')
    ET.ElementTree(company).write(
        output_file, encoding='unicode', xml_declaration=True
    )
    print(f"XML created: {output_file}")
# Two sample employee records used to demonstrate XML generation.
employees_data = [
    {
        'id': 1,
        'name': 'Alice Johnson',
        'role': 'Senior Engineer',
        'email': 'alice@company.com',
        'salary': 120000,
        'skills': ['Python', 'Docker', 'PostgreSQL'],
    },
    {
        'id': 2,
        'name': 'Bob Smith',
        'role': 'UX Designer',
        'email': 'bob@company.com',
        'salary': 95000,
        'skills': ['Figma', 'Sketch', 'CSS'],
    },
]

# Serialize the records to employees.xml.
create_xml_employees(employees_data, 'employees.xml')
lxml: High-Performance XML with XPath
from lxml import etree
def search_xpath(xml_content_or_path, xpath_expression):
    """Execute an XPath query on an XML document.

    Args:
        xml_content_or_path: Either XML markup as a ``str`` or anything
            ``etree.parse`` accepts (such as a file path). A string is
            treated as markup when its first non-whitespace character is
            ``<``.
        xpath_expression: XPath expression evaluated against the document's
            root element.

    Returns:
        The result of ``root.xpath(xpath_expression)``.
    """
    if isinstance(xml_content_or_path, str):
        # Skip leading whitespace (e.g. the newline that opens a
        # triple-quoted literal) so such markup is not mistaken for a path.
        markup = xml_content_or_path.lstrip()
        if markup.startswith('<'):
            # Parse from bytes: lxml rejects str input that carries an
            # encoding declaration.
            root = etree.fromstring(markup.encode())
            return root.xpath(xpath_expression)

    root = etree.parse(xml_content_or_path).getroot()
    return root.xpath(xpath_expression)
# Parse the shared sample catalogue once for the queries below.
root = etree.fromstring(xml_str.encode())

# Text of every <title> element
titles = root.xpath('//title/text()')
print("Titles:", titles)

# Titles of the books whose <available> child equals "true"
available = root.xpath('//book[available="true"]/title/text()')
print("Available:", available)

# Text of every <author> element
authors = root.xpath('//author/text()')
print("Authors:", authors)

# Books in ascending price order (a missing <price> counts as 0)
books_sorted = root.xpath('//book')
books_sorted.sort(key=lambda b: float(b.findtext('price', '0')))
for book in books_sorted:
    print(f" {book.findtext('title')}: ${book.findtext('price')}")
XML Namespaces
from lxml import etree
# Sample feed for the namespace example: the default namespace is
# http://www.w3.org/2005/Atom and the `media` prefix is bound to
# http://search.yahoo.com/mrss/.
xml_ns = '''<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom"
xmlns:media="http://search.yahoo.com/mrss/">
<title>My Blog</title>
<entry>
<title>First Article</title>
<id>urn:uuid:1234</id>
<updated>2024-01-15T10:30:00Z</updated>
<media:thumbnail url="https://example.com/img.jpg" width="320" height="240"/>
</entry>
</feed>'''
def parse_atom_feed(xml_content):
    """Parse a namespaced Atom feed into a plain dictionary.

    Args:
        xml_content: Feed markup as a ``str``; it is encoded to bytes before
            being handed to ``etree.fromstring``.

    Returns:
        ``{'title': <feed title or ''>, 'entries': [...]}`` where each entry
        is a dict with ``title`` and ``updated`` (``''`` when absent) and
        ``thumbnail`` (the ``url`` attribute of the first
        ``media:thumbnail``, or ``None`` when there is none).
    """
    prefixes = {
        'atom': 'http://www.w3.org/2005/Atom',
        'media': 'http://search.yahoo.com/mrss/',
    }

    def first_or(matches, fallback):
        # XPath hands back a list; take its first hit or the fallback.
        return matches[0] if matches else fallback

    feed_root = etree.fromstring(xml_content.encode())
    feed_titles = feed_root.xpath('atom:title/text()', namespaces=prefixes)

    parsed_entries = []
    for node in feed_root.xpath('atom:entry', namespaces=prefixes):
        entry_titles = node.xpath('atom:title/text()', namespaces=prefixes)
        timestamps = node.xpath('atom:updated/text()', namespaces=prefixes)
        thumbnails = node.xpath('media:thumbnail', namespaces=prefixes)
        parsed_entries.append({
            'title': first_or(entry_titles, ''),
            'updated': first_or(timestamps, ''),
            'thumbnail': thumbnails[0].get('url') if thumbnails else None,
        })

    return {'title': first_or(feed_titles, ''), 'entries': parsed_entries}
# Demo: parse the sample feed, then list each entry with its update time.
feed = parse_atom_feed(xml_ns)
print(f"Feed: {feed['title']}")
for entry in feed['entries']:
    print(
        f" - {entry['title']} ({entry['updated']})"
    )
Converting XML to JSON and Back
import json
def xml_elem_to_dict(element):
    """Recursively convert an XML element to plain Python data.

    Args:
        element: An element exposing ``attrib``, ``text``, ``tag`` and
            iteration over its children (ElementTree-style API).

    Returns:
        The stripped text as a ``str`` when the element has non-blank text
        and neither attributes nor children. Otherwise a dict that may
        contain ``'@attributes'`` (a copy of the attributes), ``'#text'``
        (the stripped text) and one key per child tag; tags that occur more
        than once map to a list of converted children in document order.
        An element with nothing in it yields ``{}``.
    """
    result = {}
    if element.attrib:
        # Copy into a plain dict: the result must not alias the element's
        # own attribute mapping (mutating one would change the other), and
        # a plain dict is always JSON-serializable.
        result['@attributes'] = dict(element.attrib)

    text = element.text.strip() if element.text else ''
    if text:
        if element.attrib or len(element):
            result['#text'] = text
        else:
            # Pure text leaf: collapse to the bare string.
            return text

    children = {}
    for child in element:
        child_value = xml_elem_to_dict(child)
        if child.tag in children:
            # Second occurrence of a tag promotes the entry to a list.
            if not isinstance(children[child.tag], list):
                children[child.tag] = [children[child.tag]]
            children[child.tag].append(child_value)
        else:
            children[child.tag] = child_value

    result.update(children)
    return result
def xml_to_json(xml_content_or_path, json_output=None, indent=2):
    """Convert an XML document to a JSON string.

    Args:
        xml_content_or_path: Either XML markup as a ``str`` or anything
            ``ET.parse`` accepts (such as a file path). A string is treated
            as markup when its first non-whitespace character is ``<``.
        json_output: Optional path; when given, the JSON text is also
            written there as UTF-8 and a confirmation line is printed.
        indent: Indentation passed to ``json.dumps``.

    Returns:
        The JSON text, shaped as ``{<root tag>: <converted root element>}``.
    """
    markup = (
        xml_content_or_path.lstrip()
        if isinstance(xml_content_or_path, str)
        else ''
    )
    if markup.startswith('<'):
        # Parse the stripped text: leading whitespace (e.g. the newline that
        # opens a triple-quoted literal) used to make markup look like a
        # file path, and it is not allowed before an XML declaration.
        root = ET.fromstring(markup)
    else:
        root = ET.parse(xml_content_or_path).getroot()

    data = {root.tag: xml_elem_to_dict(root)}
    json_str = json.dumps(data, ensure_ascii=False, indent=indent)

    if json_output:
        Path(json_output).write_text(json_str, encoding='utf-8')
        print(f"XML→JSON: {json_output}")
    return json_str
# Demo: convert the sample catalogue and show the first 200 characters.
json_result = xml_to_json(xml_str)
print(
    "XML→JSON preview:",
    json_result[:200],
    "...",
)
XSD Validation with lxml
from lxml import etree
def validate_with_xsd(xml_content, xsd_content):
    """Check an XML document against an XSD schema and report the outcome.

    Args:
        xml_content: Document markup as a ``str``.
        xsd_content: Schema markup as a ``str``.

    Returns:
        The result of ``schema.validate`` for the document, or ``False``
        when parsing raises ``etree.XMLSyntaxError``. A status message (and,
        for an invalid document, one line per schema error) is printed.
    """
    try:
        schema = etree.XMLSchema(etree.fromstring(xsd_content.encode()))
        document = etree.fromstring(xml_content.encode())
        is_valid = schema.validate(document)

        if not is_valid:
            print("❌ XML is invalid:")
            for issue in schema.error_log:
                print(f" Line {issue.line}: {issue.message}")
            return is_valid

        print("✅ XML is valid according to the XSD schema")
        return is_valid
    except etree.XMLSyntaxError as exc:
        print(f"XML syntax error: {exc}")
        return False
# Schema for the validation example: a <library> root containing a sequence
# of <book> elements (maxOccurs unbounded), each with <title>, <year> and
# <available> children in that order plus a required `isbn` attribute.
xsd = '''<?xml version="1.0" encoding="UTF-8"?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="library">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="title" type="xs:string"/>
<xs:element name="year" type="xs:integer"/>
<xs:element name="available" type="xs:boolean"/>
</xs:sequence>
<xs:attribute name="isbn" type="xs:string" use="required"/>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>'''
# Sample document with a single <book> that carries exactly the children and
# attribute the schema declares.
xml_valid = '''<library>
<book isbn="978-0-13-110362-7">
<title>The C Programming Language</title>
<year>1988</year>
<available>true</available>
</book>
</library>'''
# Run the validation; the outcome is printed by validate_with_xsd.
validate_with_xsd(xml_valid, xsd)
ElementTree vs lxml Comparison
| Criterion | ElementTree | lxml |
|---|---|---|
| Dependencies | ✅ Standard (none) | ⚠️ pip install lxml |
| Speed | ⚠️ Moderate | ✅ C-based, very fast |
| XPath | ⚠️ Limited subset | ✅ Full XPath 1.0 |
| XSLT | ❌ Not supported | ✅ Complete |
| XSD validation | ❌ Not supported | ✅ Complete |
| Namespaces | ⚠️ Awkward | ✅ Native |
| HTML parsing | ❌ | ✅ html.fromstring() |
| Large files | ⚠️ High RAM | ✅ Efficient iterparse |
Conclusion
For basic parsing and generation tasks, xml.etree.ElementTree (standard library) is sufficient. For anything involving complex XPath, namespaces, XSD/DTD validation, XSLT, or large XML files (> 100 MB), lxml is the correct choice. The XML↔JSON conversion is trivial in Python and opens the door to integrating legacy SOAP/XML systems with modern REST APIs.
Related conversions
Document conversions that follow this topic naturally: