Python XML Sax to dictionary
I'm trying to parse some relatively large XML files using the standard SAX parser in Python, and I would prefer to avoid manually checking each element and saving it to a dictionary, because I'm working with multiple XML schemas and some of them are quite large.
Obviously the code example below doesn't work, but it's what I've got so far. Other low-memory solutions are also welcome.
(Note: the complete XML files contain more than just two levels of nested structures.)
from xml import sax
from cStringIO import StringIO
# Sample document for the demo: a single <product> whose children use the
# n1/n7 namespace prefixes and nest up to three levels deep (terms/term/number).
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
<n1:product>
<n1:status>
<n7:created>2005-02-08T18:18:30.53</n7:created>
<n7:updated>2008-09-18T10:29:58.26</n7:updated>
</n1:status>
<n1:productid>28321503</n1:productid>
<n1:producttext>
<n7:text>Some product info</n7:text>
<n7:type>Info</n7:type>
</n1:producttext>
<n1:terms>
<n7:term>
<n7:number>1</n7:number>
<n7:name>Term1</n7:name>
</n7:term>
<n7:term>
<n7:number>2</n7:number>
<n7:name>Term2</n7:name>
</n7:term>
</n1:terms>
</n1:product>
</n1:products>
"""
class XML_Handler(sax.ContentHandler):
    """Collect the text of elements inside each <product> into a flat dict.

    Every element that ends while a <product> is open and that carries
    non-blank text is stored in ``self.data`` under its local (namespace-free)
    name.  Nesting is NOT preserved: a child of <status> and a child of
    <product> land in the same dict, and a repeated name overwrites the
    earlier value.  When </product> is reached the dict is printed.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        self.data = {}
        self.vbuffer = ''
        # Must exist before the first end tag is seen; otherwise any element
        # closing ahead of the first <product> raises AttributeError.
        self.fetch = False

    def startElementNS(self, name, qname, attrs):
        """Begin a fresh record when a <product> element opens."""
        (ns, localname) = name
        # Discard whitespace collected between the previous tag and this one
        # so it cannot end up as a prefix of this element's text.
        self.vbuffer = ''
        if localname == 'product':
            self.data = {}
            self.fetch = True

    def endElementNS(self, name, qname):
        """Store the finished element's text, or emit the record on </product>."""
        (ns, localname) = name
        # Strip once, on the complete text: the parser may deliver one text
        # node in several characters() calls, and stripping each piece would
        # remove whitespace that belongs inside the value.
        text = self.vbuffer.strip()
        self.vbuffer = ''
        if localname == 'product':
            # Stop collecting so elements after </product> are not added.
            self.fetch = False
            # Got my data, call some process function..
            # (single-argument call form prints the same on Python 2 and 3)
            print(self.data)
        elif self.fetch and text:
            self.data[localname] = text

    def characters(self, ch):
        """Accumulate a raw chunk of character data for the current element."""
        self.vbuffer += ch
if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace processing must be on so the handler's *NS callbacks fire.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(XML_Handler())
    reader.parse(source)
What I'm trying to achieve:
# Desired output for the sample document: nested elements become nested
# dicts, and the repeated <term> children of <terms> become a list of dicts.
result = {
    'status' : {
        'created' : '2005-02-08T18:18:30.53',
        'updated' : '2008-09-18T10:29:58.26',
    },
    'productid' : '28321503',
    'producttext' : {
        # Matches the <n7:text> value in the sample XML.
        'text' : 'Some product info',
        'type' : 'Info',
    },
    'terms' : [{'number': '1', 'name': 'Term1'}, {'number': '2', 'name': 'Term2'}]
}
https://www.assembla.com/code/pysnipps/subversion/nodes/python/mXMLDao.py — a long time ago I made a crappy library for mapping XML into classes; maybe it helps you.
Finally got this working. It might not be the most robust solution, but good enough for my use case.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import simplejson as json
from xml import sax
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
# Sample document for the demo: a single <product> whose children use the
# n1/n7 namespace prefixes; <terms> holds two repeated <term> elements.
xml_string = '''<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
<n1:product>
<n1:status>
<n7:created>2005-02-08T18:18:30.53</n7:created>
<n7:updated>2008-09-18T10:29:58.26</n7:updated>
</n1:status>
<n1:productid>28321503</n1:productid>
<n1:producttext>
<n7:text>Some product info</n7:text>
<n7:type>Info</n7:type>
</n1:producttext>
<n1:terms>
<n7:term>
<n7:number>1</n7:number>
<n7:name>Term1</n7:name>
</n7:term>
<n7:term>
<n7:number>2</n7:number>
<n7:name>Term2</n7:name>
</n7:term>
</n1:terms>
</n1:product>
</n1:products>
'''
def display(data):
    """Pretty-print *data* to standard output, eliding levels nested deeper than 10."""
    from pprint import pprint

    pprint(data, depth=10)
class Element(object):
    """Generic tree node that keeps its children as instance attributes.

    Text values and child nodes are stored directly in ``__dict__`` under the
    element name; ``jsonable()`` turns the whole tree into plain dicts, lists
    and leaf values.
    """

    def setData(self, key, value):
        """Store *value* under *key*, replacing whatever was there."""
        self.__dict__[key] = value

    def setObject(self, key, object):
        """Attach child *object* under *key*.

        The first child is stored as-is; a second child with the same key
        turns the slot into a list, and further children are appended.
        """
        # NOTE: the parameter name shadows the builtin; it is kept so callers
        # passing it by keyword keep working.
        children = self.__dict__
        if key not in children:
            children[key] = object
        elif isinstance(children[key], list):
            children[key].append(object)
        else:
            # Only a list can be appended to; any other existing value
            # becomes the first item of a new list.
            children[key] = [children[key], object]

    def jsonable(self):
        """Return the tree below this node as nested dicts/lists.

        The node itself is left untouched, so it can still be extended or
        converted again afterwards.
        """
        return self._traverse(self.__dict__)

    # http://stackoverflow.com/questions/1036409/recursively-convert-python-object-graph-to-dictionary/1118038#1118038
    def _traverse(self, obj):
        """Recursively copy *obj* into plain containers."""
        if isinstance(obj, dict):
            # Build a new dict instead of assigning into obj: obj may be a
            # live __dict__, and overwriting it would replace child nodes
            # with their converted form.
            return {k: self._traverse(v) for k, v in obj.items()}
        if hasattr(obj, "__iter__") and not isinstance(obj, (str, bytes)):
            # Strings are iterable on Python 3; without the exclusion each
            # character would recurse into itself without end.
            return [self._traverse(v) for v in obj]
        if hasattr(obj, "__dict__"):
            return {
                key: self._traverse(value)
                for key, value in vars(obj).items()
                if not callable(value) and not key.startswith('_')
            }
        return obj
class ObjBuilder(sax.ContentHandler):
    """SAX handler that builds an Element tree per *node* element and displays it.

    While an element whose local name equals ``node`` is open, every nested
    element becomes an ``Element`` attached to its parent; an element with
    non-blank text is replaced in its parent by that text.  When the ``node``
    element closes, the converted tree is passed to ``display()``.
    """

    def __init__(self, node):
        # ContentHandler is an old-style class on Python 2, so the base
        # initialiser is called explicitly rather than through super().
        sax.ContentHandler.__init__(self)
        self.obj = []           # stack of open Elements; obj[-1] is innermost
        self.node = node
        self.fetch = False      # True only while inside a `node` element
        self.rootobject = None
        self._chunks = []       # raw text pieces of the current element

    def startElementNS(self, name, qname, attrs):
        """Open a new root on `node`, or push a child Element while fetching."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = True
            self.rootobject = Element()
            # Start from a fresh stack so nothing from an earlier record
            # is kept alive.
            self.obj = [self.rootobject]
            self._chunks = []
        elif self.fetch:
            self._chunks = []
            child = Element()
            self.obj[-1].setObject(localname, child)
            self.obj.append(child)

    def characters(self, contents):
        """Collect a raw chunk of text for the element currently open."""
        if self.fetch:
            # Kept unstripped: one text node can arrive in several calls,
            # and stripping each piece would delete interior whitespace.
            self._chunks.append(contents)

    def endElementNS(self, name, qname):
        """Close the current element; display the tree when `node` closes."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = False
            display(self.rootobject.jsonable())
            # Release the finished record; otherwise the stack would retain
            # one root per record for the whole parse.
            del self.obj[:]
        elif self.fetch:
            text = ''.join(self._chunks).strip()
            self._chunks = []
            self.obj.pop()
            if text:
                # NOTE(review): as before, an element that has both child
                # elements and trailing text is replaced by the text alone.
                self._store_text(self.obj[-1], localname, text)

    def _store_text(self, parent, key, text):
        """Replace the Element just closed under *key* in *parent* with *text*."""
        slot = vars(parent).get(key)
        if isinstance(slot, list):
            # Repeated element name: the Element just closed is the last
            # item, so swap only that one and keep the earlier siblings.
            slot[-1] = text
        else:
            parent.setData(key, text)
if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace processing must be on so the handler's *NS callbacks fire.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(ObjBuilder('product'))
    reader.parse(source)
精彩评论