2011-08-07 41 views
1

我想在Python中使用标准的SAX解析器来解析一些相对较大的XML文件。我希望避免手动把每个元素逐一保存/检查到字典中,因为我要处理多个XML模式(schema),其中有些非常大。(标题:Python:用SAX把XML解析为字典)

显然下面的代码示例并不能达到目的,但这是我目前写出的全部代码。也欢迎提供其他低内存占用的解决方案。

(注:完整的XML文件中的嵌套结构远不止两层)

from xml import sax 
from cStringIO import StringIO 

# Sample input document used by the script below: a namespaced product feed
# holding a single <product> record. The record mixes plain leaf values
# (productid), nested groups (status, producttext) and a repeated element
# (<term> inside <terms>). Both the n1 and n7 prefixes are bound to the same
# namespace URI.
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?> 
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld"> 
    <n1:product> 
    <n1:status> 
     <n7:created>2005-02-08T18:18:30.53</n7:created> 
     <n7:updated>2008-09-18T10:29:58.26</n7:updated> 
    </n1:status> 
    <n1:productid>28321503</n1:productid> 
    <n1:producttext> 
     <n7:text>Some product info</n7:text> 
     <n7:type>Info</n7:type> 
    </n1:producttext> 
    <n1:terms> 
     <n7:term> 
     <n7:number>1</n7:number> 
     <n7:name>Term1</n7:name> 
     </n7:term> 
     <n7:term> 
     <n7:number>2</n7:number> 
     <n7:name>Term2</n7:name> 
     </n7:term> 
    </n1:terms> 
    </n1:product> 
</n1:products> 
""" 

class XML_Handler(sax.ContentHandler):
    """Collect the text of the elements inside each <product> into a flat dict.

    The handler must be used with namespace processing enabled, because it
    implements the ``*ElementNS`` callbacks. For every closing element found
    while inside a <product>, the accumulated character data is stored in
    ``self.data`` under the element's local name. When the <product> itself
    closes, ``self.data`` is printed.

    Limitations: the dict is flat, so nesting is not represented and an
    element whose local name was already seen overwrites the earlier value.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        self.data = {}
        # Must exist before the first endElementNS call; elements may close
        # before any <product> has been opened.
        self.fetch = False
        self.vbuffer = ''

    def startElementNS(self, name, qname, attrs):
        """Begin a fresh record whenever a <product> element opens."""
        (ns, localname) = name
        if localname == 'product':
            self.data = {}
            self.fetch = True

    def endElementNS(self, name, qname):
        """Store the buffered text for the closing element, or emit the record."""
        (ns, localname) = name
        if localname == 'product':
            # Got my data, call some process function..
            print(self.data)
            # Anything that follows belongs to no record until the next
            # <product> opens.
            self.fetch = False
        elif self.fetch:
            # Strip once, on the complete text, so that whitespace inside a
            # value survives even when it arrived in several chunks.
            text = self.vbuffer.strip()
            if text != '':
                self.data[localname] = text
        self.vbuffer = ''

    def characters(self, ch):
        """Accumulate raw character data; the parser may deliver it in chunks."""
        self.vbuffer += ch

if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace processing must be switched on so that the handler's
    # startElementNS/endElementNS callbacks are the ones being invoked.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(XML_Handler())
    reader.parse(source)

我想要实现:

# Desired output for one <product> record of the sample document: nested
# groups become nested dicts and the repeated <term> elements become a list
# of dicts. Every value is the element's text exactly as it appears in the XML.
result = {
    'status': {
        'created': '2005-02-08T18:18:30.53',
        'updated': '2008-09-18T10:29:58.26',
    },
    'productid': '28321503',
    'producttext': {
        'text': 'Some product info',
        'type': 'Info',
    },
    'terms': [
        {'number': '1', 'name': 'Term1'},
        {'number': '2', 'name': 'Term2'},
    ],
}

回答

0

最后我自己把它做出来了。这可能不是最健壮的解决方案,但足以满足我的使用场景。

#!/usr/bin/env python 
# -*- coding: utf-8 -*- 

import simplejson as json 
from xml import sax 
try: 
    from cStringIO import StringIO 
except ImportError: 
    from StringIO import StringIO 

# Sample input document parsed by this script: a namespaced product feed
# holding a single <product> record with leaf values (productid), nested
# groups (status, producttext) and a repeated element (<term> inside <terms>).
# Both the n1 and n7 prefixes are bound to the same namespace URI.
xml_string = '''<?xml version="1.0" encoding="iso-8859-1"?> 
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld"> 
    <n1:product> 
    <n1:status> 
     <n7:created>2005-02-08T18:18:30.53</n7:created> 
     <n7:updated>2008-09-18T10:29:58.26</n7:updated> 
    </n1:status> 
    <n1:productid>28321503</n1:productid> 
    <n1:producttext> 
     <n7:text>Some product info</n7:text> 
     <n7:type>Info</n7:type> 
    </n1:producttext> 
    <n1:terms> 
     <n7:term> 
     <n7:number>1</n7:number> 
     <n7:name>Term1</n7:name> 
     </n7:term> 
     <n7:term> 
     <n7:number>2</n7:number> 
     <n7:name>Term2</n7:name> 
     </n7:term> 
    </n1:terms> 
    </n1:product> 
</n1:products> 
''' 

def display(data):
    """Pretty-print *data* to standard output, limiting nesting to 10 levels."""
    from pprint import pprint

    pprint(data, depth=10)

class Element:
    """Container for one XML element; children and text live in ``__dict__``.

    Keys are element names. A value is either text, a nested ``Element``, or
    a list of those when the same key was added more than once.

    NOTE: because values are stored directly in the instance ``__dict__``, a
    key equal to one of this class's method names (for example ``setData``)
    shadows that method on the instance.
    """

    def setData(self, key, value):
        """Store *value* under *key*, replacing whatever was there."""
        self.__dict__[key] = value

    def setObject(self, key, object):
        """Add *object* under *key*, collecting repeated keys into a list.

        The first value for a key is stored as-is; a second value turns the
        slot into a two-item list, and later values are appended to it.
        """
        fields = self.__dict__
        if key not in fields:
            fields[key] = object
            return
        current = fields[key]
        if isinstance(current, list):
            current.append(object)
        elif isinstance(current, tuple):
            # Tuples cannot be appended to; continue the sequence as a list.
            fields[key] = list(current) + [object]
        else:
            fields[key] = [current, object]

    def jsonable(self):
        """Return the element tree as plain dicts, lists and leaf values.

        A new structure is built; the element itself is left untouched.
        """
        return self._traverse(self.__dict__)

    # http://stackoverflow.com/questions/1036409/recursively-convert-python-object-graph-to-dictionary/1118038#1118038
    def _traverse(self, obj):
        """Recursively convert *obj* into dicts, lists and leaf values."""
        if isinstance(obj, dict):
            return dict((key, self._traverse(value))
                        for key, value in obj.items())
        if isinstance(obj, (str, bytes)):
            # Text is a leaf. On Python 3 strings are iterable, so without
            # this check the branch below would recurse without end.
            return obj
        if hasattr(obj, "__iter__"):
            return [self._traverse(item) for item in obj]
        if hasattr(obj, "__dict__"):
            # Nested objects: drop callables and private attributes.
            return dict((key, self._traverse(value))
                        for key, value in obj.__dict__.items()
                        if not callable(value) and not key.startswith('_'))
        return obj

class ObjBuilder(sax.ContentHandler):
    """SAX handler that builds an ``Element`` tree for each *node* element.

    Must be used with namespace processing enabled, because it implements the
    ``*ElementNS`` callbacks. Elements are matched by local name only. Each
    time an element named *node* closes, the record built for it is converted
    with ``jsonable()`` and passed to ``display``.

    Only one record is kept at a time, so memory use does not grow with the
    number of records in the document.
    """

    def __init__(self, node):
        """Create a builder that emits one record per element named *node*."""
        sax.ContentHandler.__init__(self)
        self.obj = []            # stack of currently open elements
        self.node = node
        self.fetch = False       # True while inside a *node* element
        self.rootobject = None
        self.__buffer = ''

    def startElementNS(self, name, qname, attrs):
        """Open a new record or attach a child to the innermost open element."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = True
            o = Element()
            self.rootobject = o
            # Start from a fresh stack so earlier records are not kept alive.
            self.obj = [o]
        elif self.fetch:
            self.__buffer = ''
            o = Element()
            self.obj[-1].setObject(localname, o)
            self.obj.append(o)

    def characters(self, contents):
        """Accumulate raw text; the parser may deliver one value in chunks."""
        if self.fetch:
            self.__buffer += contents

    def endElementNS(self, name, qname):
        """Close the current element, or emit the record when *node* closes."""
        (ns, localname) = name
        if self.node == localname:
            self.fetch = False
            display(self.rootobject.jsonable())
            # The record has been handed over; release the element stack.
            self.obj = []
        elif self.fetch:
            # Strip once, on the complete text, so whitespace inside a value
            # survives chunked delivery.
            text = self.__buffer.strip()
            if text != '':
                parent = self.obj[-2]
                slot = vars(parent).get(localname)
                if (isinstance(slot, list) and slot
                        and slot[-1] is self.obj[-1]):
                    # Repeated element: replace only this element's
                    # placeholder so that earlier siblings are kept.
                    slot[-1] = text
                else:
                    parent.setData(localname, text)
            del self.obj[-1]
            self.__buffer = ''

if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace processing must be on for the *ElementNS callbacks to fire;
    # one record is built per <product> element.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(ObjBuilder('product'))
    reader.parse(source)