How to extract data from XML using python minidom
Given this XML file, I would like to extract the data from it. However, I have trouble extracting the data from <LandmarkPointListXml>
onwards.
The XML file:
<?xml version="1.0" encoding="utf-8"?>
<Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<MapName>er</MapName>
<MapURL>er.gif</MapURL>
<Name>er</Name>
<URL>er.gif</URL>
<LandmarkPointListXml>
<anyType xsi:type="LandmarkPointProperty">
<LandmarkPointX>400</LandmarkPointX>
<LandmarkPointY>292</LandmarkPointY>
<LandmarkDesc>my room door</LandmarkDesc>
</anyType>
<anyType xsi:type="LandmarkPointProperty">
<LandmarkPointX>399</LandmarkPointX>
<LandmarkPointY>219</LandmarkPointY>
<LandmarkDesc>bro room door</LandmarkDesc>
</anyType>
</LandmarkPointListXml>
<RegionPointListXml />
</Map>
Python program:
# Builds one flat string from the children of the document's first child.
# The child named 'LandmarkPointListXml' is delegated to loopLandmark and
# wrapped in '|'; for every other child the nodeValue of its first child is
# appended, followed by ','.
# NOTE(review): the indentation of this listing was lost, so the nesting shown
# by the statement order below is the only structural information available.
def GetMapData(self):
result = ""
haha = self.XMLdoc.firstChild #root node
for child in haha.childNodes:
if (cmp(child.nodeName,'LandmarkPointListXml')==0):
# NOTE(review): this passes the node list `child.childNodes`, while
# loopLandmark calls getElementsByTagName on its argument -- presumably the
# element `child` itself was meant; confirm against the minidom docs.
result = result + '|' + self.loopLandmark(child.childNodes) + '|'
else:
# NOTE(review): assumes every other child has a first child carrying a
# nodeValue; a child without one would fail on this line.
result = result + child.firstChild.nodeValue + ','
return result
# Intended to turn the 'anyType' landmark entries found under `landmarks`
# into a comma-separated string, which is returned.
# NOTE(review): the indentation of this listing was lost, so how the lines
# after the `if` nest cannot be determined from here.
def loopLandmark(self, landmarks):
result=""
haha=landmarks.getElementsByTagName('anyType')
# NOTE(review): `haha` is the result of getElementsByTagName (presumably a
# list of nodes), yet .childNodes and .firstChild are read from it below as if
# it were a single node -- verify.
for child in haha.childNodes:
if (cmp(haha.firstChild.nodeName,'LandmarkPointX') == 0):
result=result+child.firstChild.nodeValue+','
# NOTE(review): `ChildNode` is read here before anything in this function
# assigns it, and `child` itself is never advanced, so the three appends all
# use the same `child.firstChild.nodeValue`.
ChildNode = ChildNode.nextSibling
result=result+child.firstChild.nodeValue+','
ChildNode = ChildNode.nextSibling
result=result+child.firstChild.nodeValue
return result
I was able to retrieve the result "er,er.gif,er,er.gif," up to the point where the program reaches <LandmarkPointListXml>.
This code is quite fragile. It makes strong assumptions about the XML input, and would fail if the XML was modified in a valid way (e.g. if <LandmarkPointY> is not immediately after <LandmarkPointX>).
I suggest using a standard library when parsing XML, such as Element Tree ( http://docs.python.org/library/xml.etree.elementtree.html ) or lxml ( http://lxml.de ), which can also validate your XML input.
The code I'm writing below uses Element Tree and works on your XML input (I have removed the 'self' arguments to the parent class). It also tolerates (ignores) empty values in XML elements.
import xml.etree.ElementTree as ET
def GetMapData(xmlfile):
    """Flatten the map description in *xmlfile* into one delimited string.

    The children of the root element are visited in document order. The
    ``LandmarkPointListXml`` child is rendered by ``loopLandmark`` and
    wrapped in ``'|'``; every other child that has text contributes that
    text followed by ``','``. Children without text are skipped.

    Args:
        xmlfile: File name or file object accepted by ``ET.parse``.

    Returns:
        The flattened string, or ``""`` when the file could not be read
        (a message is printed in that case). Only ``IOError`` is handled
        here; any other error raised while parsing propagates.
    """
    try:
        tree = ET.parse(xmlfile)
    except IOError as e:
        # Without a parsed tree there is nothing to walk, so stop here
        # instead of falling through to an unbound ``tree``.
        print("Failure Parsing %s: %s" % (xmlfile, e))
        return ""

    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append('|' + loopLandmark(child) + '|')
        elif child.text is not None:
            parts.append(child.text + ',')
    return "".join(parts)
def loopLandmark(landmarks, fields=('LandmarkPointX', 'LandmarkPointY')):
    """Collect selected values from every ``anyType`` entry of *landmarks*.

    Args:
        landmarks: Iterable of ElementTree elements (for example the
            ``LandmarkPointListXml`` element). Entries whose tag is not
            ``'anyType'`` are ignored.
        fields: Tags of the sub-elements whose text is emitted. Defaults
            to the X and Y coordinates; pass e.g.
            ``('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')`` to
            include the description as well.

    Returns:
        The selected texts in document order, each followed by ``','``.
        Sub-elements with missing or empty text are skipped. The result is
        ``""`` when nothing matches.
    """
    values = []
    for landmark in landmarks:
        # The xsi:type attribute is not checked; only the tag name is.
        if landmark.tag != 'anyType':
            continue
        for child in landmark:
            if child.text and child.tag in fields:
                values.append(child.text + ',')
    return "".join(values)
# Example call: run the extraction on 'xml.in'; the returned string is not
# used here.
GetMapData( 'xml.in' )
I managed to extract the data from the XML file posted, but I feel that it could be simpler than the answer I have provided here. It takes a lot of looping to get each piece of data.
import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString
class mapDataClass:
    """Holds a minidom document with a <Map> root and flattens it to text."""

    def __init__(self):
        # Start from an empty document that contains only a <Map> root.
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create a <Map> element and append it to the current document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the underlying minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed and its root element is named
            'Map'; False otherwise. When the file cannot be opened a
            message is printed and the previously held document is kept.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        # Only drop the old document once the new one has been parsed, so a
        # failed load does not leave an unlinked (emptied) document behind.
        self.XMLdoc.unlink()
        self.XMLdoc = new_doc

        # documentElement is the root element even when the file starts
        # with a comment or DOCTYPE, which firstChild would return instead.
        root = new_doc.documentElement
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
            return True
        return False

    def GetMapData(self):
        """Flatten the loaded map into a single delimited string.

        Element children of the root are visited in order: the text of
        'URL' is appended as is, 'LandmarkPointListXml' is rendered by
        loopLandmark, 'RegionPointListXml' only prints 'Empty', and any
        other element contributes its text followed by ','.

        Returns:
            '<fields>|<landmarks>EMPTY', where <landmarks> is '' when the
            document has no landmark list.
        """
        result = ""
        result1 = ""  # stays empty when there is no LandmarkPointListXml
        root = self.XMLdoc.documentElement
        for child in root.childNodes:
            # Whitespace between elements shows up as text nodes; only
            # element nodes carry map data.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                result1 = self.loopLandmark(child)
            elif name == 'RegionPointListXml':
                print('Empty')
            elif name == 'URL':
                result = result + self._text(child)
            else:
                result = result + self._text(child) + ','
        return result + "|" + result1 + "EMPTY"

    def loopLandmark(self, landmarks):
        """Render every 'anyType' element below *landmarks*.

        Returns:
            The entries joined by ';' and terminated by '|', or '' when
            there are no entries.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'X,Y,Desc' for one landmark element.

        Raises:
            IndexError: if one of the three sub-elements is missing.
        """
        tags = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(self._text(anyType.getElementsByTagName(tag)[0])
                        for tag in tags)

    @staticmethod
    def _text(node):
        """Return the nodeValue of *node*'s first child, or '' if absent."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue
# Demo: load a map file and print the flattened data.
profile = mapDataClass()
# Raw string: the path contains a literal backslash, and '\e' is not a valid
# escape sequence. The resulting string value is unchanged.
boolean = profile.LoadXMLFile(r'upload\er.m')
print(boolean)
# Only read the map when loading succeeded; there is nothing valid to
# flatten otherwise.
if boolean:
    result = profile.GetMapData()
    print(result)
My previous answer is still not complete. Here is the one which I think should be alright.
import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node
class mapDataClass:
    """Holds a minidom document with a <Map> root and flattens it to text."""

    def __init__(self):
        # Start from an empty document that contains only a <Map> root.
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create a <Map> element and append it to the current document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the underlying minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed and its root element is named
            'Map'; False otherwise. When the file cannot be opened a
            message is printed and the previously held document is kept.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        # Only drop the old document once the new one has been parsed, so a
        # failed load does not leave an unlinked (emptied) document behind.
        self.XMLdoc.unlink()
        self.XMLdoc = new_doc

        # documentElement is the root element even when the file starts
        # with a comment or DOCTYPE, which firstChild would return instead.
        root = new_doc.documentElement
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
            return True
        return False

    def GetMapData(self):
        """Flatten the loaded map into a single delimited string.

        Element children of the root are visited in order: the text of
        'URL' is appended as is, 'LandmarkPointListXml' is rendered by
        loopLandmark ('EMPTY|' when it holds no entries),
        'RegionPointListXml' yields 'EMPTY' when it has no element
        children, and any other element contributes its text followed
        by ','.

        Returns:
            '<fields>|<landmarks><regions>'.
        """
        result = ""
        result1 = ""
        result2 = ""
        root = self.XMLdoc.documentElement
        for child in root.childNodes:
            # Whitespace between elements shows up as text nodes; only
            # element nodes carry map data.
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                # Judge emptiness by the rendered entries, not firstChild,
                # so a list holding only whitespace still counts as empty.
                result1 = self.loopLandmark(child) or 'EMPTY|'
            elif name == 'RegionPointListXml':
                if not self._has_element_children(child):
                    result2 = 'EMPTY'
            elif name == 'URL':
                result = result + self._text(child)
            else:
                result = result + self._text(child) + ','
        return result + "|" + result1 + result2

    def loopLandmark(self, landmarks):
        """Render every 'anyType' element below *landmarks*.

        Returns:
            The entries joined by ';' and terminated by '|', or '' when
            there are no entries.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'X,Y,Desc' for one landmark element.

        Raises:
            IndexError: if one of the three sub-elements is missing.
        """
        tags = ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc')
        return ",".join(self._text(anyType.getElementsByTagName(tag)[0])
                        for tag in tags)

    @staticmethod
    def _text(node):
        """Return the nodeValue of *node*'s first child, or '' if absent."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    @staticmethod
    def _has_element_children(node):
        """Tell whether *node* has at least one element among its children."""
        return any(c.nodeType == Node.ELEMENT_NODE for c in node.childNodes)
# Demo: load a map file and print the flattened data when loading succeeds.
data = mapDataClass()
# Raw string: the path contains a literal backslash, and '\h' is not a valid
# escape sequence. The resulting string value is unchanged.
success = data.LoadXMLFile(r"upload\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")
精彩评论