开发者

How to extract data from XML using python minidom

Given this XML file, I would like to extract the data from it. However, I have trouble extracting the data from <LandmarkPointListXml> onwards.

The XML file:

  <?xml version="1.0" encoding="utf-8"?>
  <Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <MapName>er</MapName>
  <MapURL>er.gif</MapURL>
  <Name>er</Name>
  <URL>er.gif</URL>
  <LandmarkPointListXml>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>400</LandmarkPointX>
      <LandmarkPointY>292</LandmarkPointY>
      <LandmarkDesc>my room door</LandmarkDesc>
    </anyType>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>399</LandmarkPointX>
      <LandmarkPointY>219</LandmarkPointY>
      <LandmarkDesc>bro room door</LandmarkDesc>
    </anyType>
  </LandmarkPointListXml>
  <RegionPointListXml />
</Map>

Python program:

    def GetMapData(self):
        """Build a delimited string from the children of the document's first node.

        A child named 'LandmarkPointListXml' contributes the result of
        ``self.loopLandmark`` wrapped in '|' characters; every other child
        contributes the value of its first child node followed by ','.
        """
        result = ""
        haha = self.XMLdoc.firstChild #root node
        for child in haha.childNodes:
            if (cmp(child.nodeName,'LandmarkPointListXml')==0):
                # NOTE(review): this passes child.childNodes, while loopLandmark
                # calls getElementsByTagName() on its argument -- presumably the
                # element itself (child) is what is needed; confirm.
                result = result + '|' + self.loopLandmark(child.childNodes) + '|'
            else:
                # NOTE(review): assumes every other child has a first child
                # with a string nodeValue -- TODO confirm for whitespace text
                # nodes and empty elements.
                result = result + child.firstChild.nodeValue + ','
        return result

    def loopLandmark(self, landmarks):
        """Attempt to join the values found under the 'anyType' elements.

        Looks up 'anyType' on ``landmarks`` and, inside the loop, appends
        ``child.firstChild.nodeValue`` three times with ',' between the values.
        """
        result=""
        haha=landmarks.getElementsByTagName('anyType')
        # NOTE(review): haha is the result of getElementsByTagName(); it is
        # presumably a list of elements rather than a single node, so
        # .childNodes / .firstChild on it look suspect -- verify.
        for child in haha.childNodes:
            if (cmp(haha.firstChild.nodeName,'LandmarkPointX') == 0):
                result=result+child.firstChild.nodeValue+','
                # ChildNode is read here before it is assigned anywhere in this
                # method, and `child` itself is never advanced, so the three
                # appends all read the same child.firstChild.nodeValue.
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue+','
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue
        return result

I was able to retrieve the result, "er,er.gif,er,er.gif," till the program reaches <LandmarkPointListXml>.


This code is quite fragile. It makes strong assumptions about the XML input, and would fail if the XML was modified in a valid way (e.g. if <LandmarkPointY> is not immediately after <LandmarkPointX>).

I suggest using a standard library when parsing XML, such as Element Tree ( http://docs.python.org/library/xml.etree.elementtree.html ) or lxml ( http://lxml.de ), which can also validate your XML input.

The code I'm writing below uses Element Tree and works on your XML input (I have removed the 'self' arguments to the parent class). It also tolerates (ignores) empty values in XML elements.

import xml.etree.ElementTree as ET

def GetMapData( xmlfile ):
    """Flatten the top-level elements of a map XML file into one string.

    Each direct child of the root contributes to the result in document
    order: a <LandmarkPointListXml> element contributes
    '|' + loopLandmark(element) + '|', and any other element with text
    contributes that text followed by ','.  Elements without text are
    skipped.

    Args:
        xmlfile: path (or file object) accepted by ElementTree.parse.

    Returns:
        The flattened string, or "" when the file cannot be read (a
        message is printed in that case).
    """
    try:
        tree = ET.parse( xmlfile )
    except IOError as e:
        # There is no tree to walk, so stop here instead of falling through
        # to tree.getroot() with `tree` unbound.
        print( "Failure Parsing %s: %s" % (xmlfile, e) )
        return ""

    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append( '|' + loopLandmark(child) + '|' )
        elif child.text is not None:
            parts.append( child.text + ',' )
    return "".join( parts )

def loopLandmark( landmarks ):
    """Concatenate the X/Y coordinate texts of every <anyType> landmark.

    Args:
        landmarks: an iterable of elements (e.g. the <LandmarkPointListXml>
            element); only children tagged 'anyType' are inspected.

    Returns:
        Every non-empty <LandmarkPointX>/<LandmarkPointY> text, each followed
        by ',', in document order ("" when nothing matches).
    """
    coordinate_tags = ( 'LandmarkPointX', 'LandmarkPointY' )
    pieces = []
    for landmark in landmarks:
        # Open question from the author: also check
        # xsi:type="LandmarkPointProperty" here?
        if landmark.tag != 'anyType':
            continue
        for field in landmark:
            if field.text and field.tag in coordinate_tags:
                pieces.append( field.text + ',' )
    return "".join( pieces )

# Example invocation on a file named 'xml.in'; the returned string is not
# used here.
GetMapData( 'xml.in' )


I managed to extract the data from the XML file posted, but I feel it could be simpler than the answer I provided: it takes a lot of looping to get each piece of data.

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString

class mapDataClass:
    """Hold a minidom map document and flatten it into a delimited string.

    A new instance owns an empty document with a single <Map> root.
    LoadXMLFile() replaces that document with a parsed file and
    GetMapData() renders the root's children as
    ``<scalars>|<landmarks>EMPTY``.
    """

    def __init__(self):
        """Start with an empty document that contains only a <Map> root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the underlying minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: path of the XML file to parse.

        Returns:
            True when the file was parsed; False (after printing a message)
            when opening it raised IOError.
        """
        try:
            self.XMLdoc.unlink()
            self.XMLdoc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        # Adopt the first child as the root node only when it is <Map>.
        if (self.XMLdoc.hasChildNodes()
                and self.XMLdoc.firstChild.nodeName == 'Map'):
            self.RootNode = self.XMLdoc.firstChild
        return True

    def _nodeText(self, node):
        """Return the value of node's first child, or "" when it has none."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    def GetMapData(self):
        """Flatten the children of the document's first node into a string.

        Element children are handled by name:
          * 'LandmarkPointListXml' -> rendered by loopLandmark();
          * 'RegionPointListXml'   -> prints 'Empty', contributes nothing;
          * 'URL'                  -> its text, without a trailing comma;
          * anything else          -> its text followed by ','.
        Non-element children (such as the whitespace between tags) are
        skipped, and an element without text contributes "".

        Returns:
            ``<scalars>|<landmarks>EMPTY`` -- the trailing 'EMPTY' is always
            appended by this version.
        """
        root = self.XMLdoc.firstChild  # root node
        children = root.childNodes if root is not None else ()

        scalars = ""
        landmarks = ""  # stays empty when there is no landmark list at all
        for child in children:
            if child.nodeType != child.ELEMENT_NODE:
                # Pretty-printed XML puts whitespace text nodes between the
                # elements; they carry no data.
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                landmarks = self.loopLandmark(child)
            elif name == 'RegionPointListXml':
                print('Empty')
            elif name == 'URL':
                scalars += self._nodeText(child)
            else:
                scalars += self._nodeText(child) + ','
        return scalars + "|" + landmarks + "EMPTY"

    def loopLandmark(self, landmarks):
        """Render every <anyType> below ``landmarks``.

        Returns:
            The loopAnyType() strings joined by ';' and terminated by '|',
            or "" when there is no <anyType> element.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'X,Y,Desc' for one <anyType> landmark element.

        Raises:
            IndexError: when LandmarkPointX, LandmarkPointY or LandmarkDesc
                is missing below ``anyType``.
        """
        fields = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(
            self._nodeText(anyType.getElementsByTagName(tag)[0])
            for tag in fields)

# Demo driver: load a map file, report whether loading succeeded, then print
# the flattened map data.  The backslash in the path is escaped so the
# literal keeps the same value without relying on an invalid escape.
profile = mapDataClass()
boolean = profile.LoadXMLFile('upload\\er.m')
print(boolean)
result = profile.GetMapData()
print(result)


My previous answer is still not complete. Here is the one which I think should be alright.

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node

class mapDataClass:
    """Hold a minidom map document and flatten it into a delimited string.

    A new instance owns an empty document with a single <Map> root.
    LoadXMLFile() replaces that document with a parsed file and
    GetMapData() renders the root's children as
    ``<scalars>|<landmarks><regions>``.
    """

    def __init__(self):
        """Start with an empty document that contains only a <Map> root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the underlying minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: path of the XML file to parse.

        Returns:
            True when the file was parsed; False (after printing a message)
            when opening it raised IOError.
        """
        try:
            self.XMLdoc.unlink()
            self.XMLdoc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        # Adopt the first child as the root node only when it is <Map>.
        if (self.XMLdoc.hasChildNodes()
                and self.XMLdoc.firstChild.nodeName == 'Map'):
            self.RootNode = self.XMLdoc.firstChild
        return True

    def _nodeText(self, node):
        """Return the value of node's first child, or "" when it has none."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    def GetMapData(self):
        """Flatten the children of the document's first node into a string.

        Only element children are considered (whitespace text nodes between
        tags are skipped).  They are handled by name:
          * 'LandmarkPointListXml' -> loopLandmark() output, or 'EMPTY|'
            when the list contains no <anyType> element;
          * 'RegionPointListXml'   -> 'EMPTY' when it has no children,
            otherwise nothing;
          * 'URL'                  -> its text, without a trailing comma;
          * anything else          -> its text followed by ','.
        An element without text contributes "".

        Returns:
            ``<scalars>|<landmarks><regions>``.
        """
        root = self.XMLdoc.firstChild  # root node
        children = root.childNodes if root is not None else ()

        scalars = ""
        landmarks = ""
        regions = ""
        for child in children:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                # loopLandmark() returns "" exactly when there is no
                # <anyType>; that also covers a list holding only
                # whitespace, which would otherwise drop the '|' separator.
                landmarks = self.loopLandmark(child) or 'EMPTY|'
            elif name == 'RegionPointListXml':
                if child.firstChild is None:
                    regions = 'EMPTY'
            elif name == 'URL':
                scalars += self._nodeText(child)
            else:
                scalars += self._nodeText(child) + ','
        return scalars + "|" + landmarks + regions

    def loopLandmark(self, landmarks):
        """Render every <anyType> below ``landmarks``.

        Returns:
            The loopAnyType() strings joined by ';' and terminated by '|',
            or "" when there is no <anyType> element.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'X,Y,Desc' for one <anyType> landmark element.

        Raises:
            IndexError: when LandmarkPointX, LandmarkPointY or LandmarkDesc
                is missing below ``anyType``.
        """
        fields = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(
            self._nodeText(anyType.getElementsByTagName(tag)[0])
            for tag in fields)

# Demo driver: flatten the map only when the file could be loaded.  The
# backslash in the path is escaped so the literal keeps the same value
# without relying on an invalid escape.
data = mapDataClass()
success = data.LoadXMLFile("upload\\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜