开发者

Python XML Sax to dictionary

I'm trying to parse some relatively large XML files using the standard SAX parser in Python, and I would prefer to avoid manually saving/checking each element into a dictionary, because I'm working with multiple XML schemas and some of them are quite large.

Obviously the code example below doesn't work, but it's what I've got so far. Other low-memory solutions are also welcome.

(Note: the complete XML files contain more than just two levels of nested structures.)

from xml import sax
from cStringIO import StringIO

# Sample input: one <product> record using two namespace prefixes (n1, n7)
# that are both bound to the same URI.  The XML declaration must read
# exactly `<?xml version=...`; any stray text inside it makes the document
# malformed and the parser rejects it.
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
  <n1:product>
    <n1:status>
      <n7:created>2005-02-08T18:18:30.53</n7:created>
      <n7:updated>2008-09-18T10:29:58.26</n7:updated>
    </n1:status>
    <n1:productid>28321503</n1:productid>
    <n1:producttext>
      <n7:text>Some product info</n7:text>
      <n7:type>Info</n7:type>
    </n1:producttext>
    <n1:terms>
      <n7:term>
        <n7:number>1</n7:number>
        <n7:name>Term1</n7:name>
      </n7:term>
      <n7:term>
        <n7:number>2</n7:number>
        <n7:name>Term2</n7:name>
      </n7:term>
    </n1:terms>   
  </n1:product>
</n1:products>
"""

class XML_Handler(sax.ContentHandler):
    """Namespace-aware SAX handler that collects the text of a <product>.

    Every element that closes inside a ``product`` element and carries
    non-empty text is stored in ``self.data`` under its local name.  The
    mapping is flat: elements without text of their own are skipped and a
    repeated local name overwrites the earlier value.  When ``product``
    closes, the collected mapping is printed.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        self.data = {}
        self.vbuffer = ''
        # Must exist before the first event: endElementNS reads it for
        # elements that close before any <product> has been opened.
        self.fetch = False

    def startElementNS(self, name, qname, attrs):
        """Start a fresh record when a ``product`` element opens."""
        (ns, localname) = name
        if localname == 'product':
            self.data = {}
            self.fetch = True

    def endElementNS(self, name, qname):
        """Store the buffered text of a closing element, or emit the record."""
        (ns, localname) = name
        if localname == 'product':
            # Got my data, call some process function..
            print(self.data)
            # Stop collecting until the next <product> opens.
            self.fetch = False
        elif self.fetch and self.vbuffer != '':
            self.data[localname] = self.vbuffer
        self.vbuffer = ''

    def characters(self, ch):
        """Append a chunk of character data, dropping trailing whitespace."""
        self.vbuffer += ch.rstrip()

if __name__ == '__main__':
    # Wire a namespace-aware SAX reader to the handler, then feed it the
    # sample document through an in-memory byte stream.
    handler = XML_Handler()
    reader = sax.make_parser()
    reader.setContentHandler(handler)
    reader.setFeature(sax.handler.feature_namespaces, 1)

    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))
    reader.parse(source)

What I'm trying to achieve:

# Desired output for the sample <product>: nested elements become nested
# dicts, repeated elements become a list, and leaf text is kept verbatim.
result = {
    'status' : {
        'created' : '2005-02-08T18:18:30.53',
        'updated' : '2008-09-18T10:29:58.26',
    },
    'productid' : '28321503',
    'producttext' : {
        'text' : 'Some product info',
        'type' : 'Info',
    },
    'terms' : [{'number': '1', 'name': 'Term1'}, {'number': '2', 'name': 'Term2'}]
}


https://www.assembla.com/code/pysnipps/subversion/nodes/python/mXMLDao.py — a long time ago I made a crappy library for mapping XML into classes; maybe it helps you.


Finally got this working. It might not be the most robust solution, but good enough for my use case.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import simplejson as json
from xml import sax
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

# Sample input: a single <product> record.  The n1 and n7 prefixes are both
# bound to the same namespace URI, and <n7:term> occurs twice inside
# <n1:terms>.
xml_string = '''<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
  <n1:product>
    <n1:status>
      <n7:created>2005-02-08T18:18:30.53</n7:created>
      <n7:updated>2008-09-18T10:29:58.26</n7:updated>
    </n1:status>
    <n1:productid>28321503</n1:productid>
    <n1:producttext>
      <n7:text>Some product info</n7:text>
      <n7:type>Info</n7:type>
    </n1:producttext>
    <n1:terms>
      <n7:term>
        <n7:number>1</n7:number>
        <n7:name>Term1</n7:name>
      </n7:term>
      <n7:term>
        <n7:number>2</n7:number>
        <n7:name>Term2</n7:name>
      </n7:term>
    </n1:terms>   
  </n1:product>
</n1:products>
'''

def display(data):
    """Pretty-print *data* to standard output, eliding levels nested deeper than 10."""
    from pprint import pprint as _pprint
    _pprint(data, depth=10)

class Element(object):
    """Attribute bag representing one XML element.

    Child values are stored directly in the instance ``__dict__`` keyed by
    name; ``jsonable()`` converts the whole tree into plain dicts and lists.
    """

    def setData(self, key, value):
        """Store *value* under *key*, replacing whatever was there."""
        self.__dict__[key] = value

    def setObject(self, key, object):
        """Add *object* under *key*, collecting repeated keys into a list.

        The first value for a key is stored as-is; a second value turns the
        entry into a two-item list, and later values are appended to it.
        """
        attrs = self.__dict__
        if key not in attrs:
            attrs[key] = object
            return
        existing = attrs[key]
        if isinstance(existing, list):
            existing.append(object)
        elif isinstance(existing, tuple):
            # Tuples cannot be appended to; continue the sequence as a list.
            attrs[key] = list(existing) + [object]
        else:
            attrs[key] = [existing, object]

    def jsonable(self):
        """Return a new dict/list/scalar structure mirroring this element.

        The element itself is left untouched, so the method can be called
        repeatedly and children remain ``Element`` instances.
        """
        return self._traverse(self.__dict__)

    # http://stackoverflow.com/questions/1036409/recursively-convert-python-object-graph-to-dictionary/1118038#1118038
    def _traverse(self, obj):
        """Recursively convert *obj* into dicts, lists and leaf values."""
        if isinstance(obj, dict):
            # Build a copy instead of assigning into obj: the top-level call
            # passes the live instance __dict__.
            return dict((k, self._traverse(v)) for k, v in obj.items())
        if isinstance(obj, (str, bytes)):
            # Strings are leaves.  On Python 3 they expose __iter__, and
            # iterating them would recurse forever on one-character strings.
            return obj
        if hasattr(obj, "__iter__"):
            return [self._traverse(v) for v in obj]
        if hasattr(obj, "__dict__"):
            # Nested objects: skip callables and underscore-prefixed names.
            return dict((key, self._traverse(value))
                        for key, value in obj.__dict__.items()
                        if not callable(value) and not key.startswith('_'))
        return obj

class ObjBuilder(sax.ContentHandler):
    """Namespace-aware SAX handler that builds an Element tree per record.

    Each time an element whose local name equals *node* opens, a new root
    ``Element`` is started; descendants are attached by local name, with
    text-only elements stored as their stripped text.  When the record
    element closes, the tree is converted with ``jsonable()`` and passed to
    ``display``.
    """

    def __init__(self, node):
        sax.ContentHandler.__init__(self)
        self.obj = []            # stack of currently open Elements, root first
        self.node = node         # local name of the record element
        self.fetch = False       # True while inside a record element
        self.rootobject = None   # Element of the record being built
        self.__buffer = []       # text chunks of the innermost open element

    def startElementNS(self, name, qname, attrs):
        """Open a new record, or attach a child Element to the open one."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = True
            o = Element()
            self.rootobject = o
            # Start from a fresh stack so finished records are not retained
            # for the remainder of the document.
            self.obj = [o]
            self.__buffer = []
        elif self.fetch:
            self.__buffer = []
            o = Element()
            self.obj[-1].setObject(localname, o)
            self.obj.append(o)

    def characters(self, contents):
        """Collect a chunk of character data for the innermost element."""
        if self.fetch:
            # The parser may deliver one text node in several chunks; keep
            # them raw and strip once at the end so whitespace that falls on
            # a chunk boundary inside the text is preserved.
            self.__buffer.append(contents)

    def endElementNS(self, name, qname):
        """Close the innermost element, or emit the finished record."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = False
            display(self.rootobject.jsonable())
            self.obj = []
        elif self.fetch:
            text = ''.join(self.__buffer).strip()
            if text != '':
                self._store_text(localname, text)
            del self.obj[-1]
            self.__buffer = []

    def _store_text(self, localname, text):
        """Replace the placeholder Element of the closing element with *text*."""
        parent = self.obj[-2]
        current = self.obj[-1]
        siblings = parent.__dict__.get(localname)
        if isinstance(siblings, list) and siblings and siblings[-1] is current:
            # Repeated element name: swap only this element's slot so the
            # values of earlier siblings are kept.
            siblings[-1] = text
        else:
            parent.setData(localname, text)

if __name__ == '__main__':
    # Build 'product' records from the sample document using a
    # namespace-aware SAX reader fed from an in-memory byte stream.
    builder = ObjBuilder('product')
    reader = sax.make_parser()
    reader.setContentHandler(builder)
    reader.setFeature(sax.handler.feature_namespaces, 1)

    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))
    reader.parse(source)
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜