开发者

search/replace content of xml

I've been successful using xml.etree.ElementTree to parse an XML file, search for content, and then write this to a different XML file. However, I only worked with text inside a single tag.

import os, sys, glob, xml.etree.ElementTree as ET
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
# For every sub-folder of `path`, copy the <abstract> text of the archived
# metadata file into the <abstract> element of the working metadata file.
for fn in os.listdir(path):
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        (pa, filename) = os.path.split(filepath)

        # Grab the abstract text from the old, archived metadata file; this
        # text then gets put into the current, working .xml.
        archive_root = ET.parse(pa + os.sep + "archive" + os.sep + "base_metadata_overall.xml").getroot()

        # Reset for every file: without this, an archive that has no
        # <abstract> would either raise NameError (first folder) or silently
        # reuse the text found in a previous folder.
        correct_abstract = None
        # findall(".//tag") works on every ElementTree version, whereas
        # getiterator() was removed in Python 3.9.
        for item in archive_root.findall(".//abstract"):
            # If there are several, the last one in document order wins.
            correct_abstract = item.text
        if correct_abstract is None:
            print("skipped --- " + filename + " (no abstract text in archive)")
            continue

        target_file = pa + os.sep + "base_metadata_overall.xml"
        tree = ET.parse(target_file)

        # <abstract> is a child of <descript>, so it has to be looked up
        # inside each <descript>; a <descript> element itself never has the
        # tag "abstract".
        for descript in tree.getroot().findall(".//descript"):
            old_abstract = descript.find("abstract")
            if old_abstract is not None:
                # Update in place so the element keeps its tag, position and
                # attributes.
                old_abstract.text = correct_abstract
        tree.write(target_file)
        print("created --- " + filename + " metadata")

But now, I need to:

1) search an xml and grab everything between "attr" tags, below is example:

<attr><attrlabl Sync="TRUE">OBJECTID</attrlabl><attalias Sync="TRUE">ObjectIdentifier</attalias><attrtype Sync="TRUE">OID</attrtype><attwidth Sync="TRUE">4</attwidth><atprecis Sync="TRUE">0</atprecis><attscale Sync="TRUE">0</attscale><attrdef Sync="TRUE">Internal feature number.</attrdef></attr>

2) Now, I need to open a different xml and search for all content between the same "attr" tag and replace with the above.

Basically, this is what I was doing before, but ignoring subelements, attributes, etc. between the "attr" tags and treating everything between them like text.

thanks!!

Please bear with me, posting on this forum is a little different than what I'm used to!

Here's what I have so far:

import os, sys, glob, re, xml.etree.ElementTree as ET
from lxml import etree

import copy  # deepcopy is used below but was missing from the imports above

path = r"C:\\temp\\python\\xml"
# For each sub-folder, replace the <attr> elements of
# base_metadata_overall.xml with the corresponding <attr> elements of
# attributes.xml (paired in document order) and print the resulting document.
for fn in os.listdir(path):
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        (pa, filename) = os.path.split(filepath)

        # Parse both documents instead of slicing them with a regex: an
        # <attr> fragment parsed on its own has no parent, so it cannot be
        # replaced inside the target document.
        # Binary mode lets the XML parser handle the encoding declaration.
        with open(pa + os.sep + "attributes.xml", "rb") as source_file:
            source = etree.fromstring(source_file.read())
        with open(pa + os.sep + "base_metadata_overall.xml", "rb") as target_file:
            dest = etree.fromstring(target_file.read())

        replacements = source.xpath('//attr')
        # zip() stops at the shorter list, so differing <attr> counts cannot
        # raise; surplus nodes on either side are left untouched.
        for attr, replacement in zip(dest.xpath('//attr'), replacements):
            # deepcopy so the node is copied rather than moved out of `source`.
            attr.getparent().replace(attr, copy.deepcopy(replacement))
        print(etree.tostring(dest))

I got this working (see below) and even figured out how to export to a new .xml file. However, if the number of attr's differs between source and dest, I get the following error. Any suggestions?

node = replacements.pop()

IndexError: pop from empty list

import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
path = r"C:\\temp\\python\\xml"
# For each sub-folder, replace every <attr> in base_metadata_overall.xml with
# the <attr> at the same document position in attributes.xml and save the
# result as edited_metadata.xml.
for fn in os.listdir(path):
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        # Folder that holds the matched file; every path below is built on it.
        (pa, filename) = os.path.split(filepath)
        # Binary mode lets the XML parser handle the encoding declaration;
        # `with` closes the files even if parsing fails.
        with open(pa + os.sep + "attributes.xml", "rb") as xmlatributes:
            xmlatributes_txt = xmlatributes.read()
        with open(pa + os.sep + "base_metadata_overall.xml", "rb") as xmltarget:
            xmltarget_txt = xmltarget.read()
        source = lxml.etree.fromstring(xmlatributes_txt)
        dest = lxml.etree.fromstring(xmltarget_txt)

        replacements = source.xpath('//attr')
        targets = dest.xpath('//attr')
        if len(replacements) != len(targets):
            # Popping from the replacement list raised IndexError here when
            # dest had more <attr> nodes than source; report it instead.
            print(fn + "--- attr count differs (source " + str(len(replacements))
                  + ", dest " + str(len(targets)) + "); only matching pairs replaced")

        # zip() pairs the nodes in document order and stops at the shorter list.
        for attr, node in zip(targets, replacements):
            # deepcopy so the node is copied rather than moved out of `source`.
            attr.getparent().replace(attr, copy.deepcopy(node))

        # NOTE(review): `dest` is an lxml element handed to xml.etree's
        # ElementTree; lxml's own dest.getroottree().write(...) may be the
        # safer choice - confirm the output is equivalent before switching.
        tree = ET.ElementTree(dest)
        tree.write(pa + os.sep + "edited_metadata.xml")
        print(fn + "--- sucessfully edited")

update 5/16/2011 restructured a few things to fix the "IndexError: pop from empty list" error mentioned above. Realized that the replacement of the "attr" tags will not always be a 1-to-1 replacement. For ex. sometimes the source .xml has 20 attr's and the destination .xml has 25 attr's. In this case, the 1-to-1 replacement would choke.

Anyway, the code below removes all of the attr's, then inserts the source attr's in their place. It also checks for another tag, "subtype"; if any exist, it adds them after the attr's, but still inside the "detailed" tags.

thanks again to everyone who helped.

import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
#path = r"C:\\temp\python\\xml"
# For each sub-folder: remove every <attr> from base_metadata_overall.xml,
# insert the <attr> elements (and any <subtype> elements) from attributes.xml
# into <detailed>, and save the result as base_metadata_overall_v2.xml.
for fn in os.listdir(path):
    correct_title = fn.replace('_', ' ') + " various facilities"
    correct_fc_name = fn.replace('_', ' ')
    filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
    for filepath in filepaths:
        print("-----" + fn + "-----")
        (pa, filename) = os.path.split(filepath)
        # Binary mode lets the XML parser handle the encoding declaration;
        # `with` closes the files even if parsing fails.
        with open(pa + os.sep + "attributes.xml", "rb") as xmlatributes:
            xmlatributes_txt = xmlatributes.read()
        with open(pa + os.sep + "base_metadata_overall.xml", "rb") as xmltarget:
            xmltarget_txt = xmltarget.read()
        source = lxml.etree.fromstring(xmlatributes_txt)
        dest = lxml.etree.fromstring(xmltarget_txt)
        replacements = source.xpath('//attr')
        replacesubtypes = source.xpath('//subtype')

        attrtag = dest.xpath('//attr')
        for n in attrtag:
            n.getparent().remove(n)
        if attrtag:
            # Reported once per file; guarded because `n` is unbound when the
            # target document contains no <attr> at all.
            print(n.tag + " removed")

        detailedtag = dest.xpath('//detailed')
        for n2 in detailedtag:
            # Start at index 1 so the new nodes follow the first child of
            # <detailed>; advancing `pos` after every insert keeps them in
            # source document order (a fixed index would reverse them).
            # NOTE(review): inserting moves the nodes out of `source`, so if a
            # document has more than one <detailed>, only the last one keeps
            # them - confirm there is always exactly one.
            pos = 1
            for realatrs in replacements:
                n2.insert(pos, realatrs)
                pos += 1
            print("attr's replaced")
            if replacesubtypes:
                # Subtypes go directly after the attrs, also in source order.
                for realsubtypes in replacesubtypes:
                    n2.insert(pos, realsubtypes)
                    pos += 1
                print("subtype's replaced")

        # NOTE(review): `dest` is an lxml element handed to xml.etree's
        # ElementTree; lxml's own dest.getroottree().write(...) may be the
        # safer choice - confirm the output is equivalent before switching.
        tree = ET.ElementTree(dest)
        tree.write(pa + os.sep + "base_metadata_overall_v2.xml")
        print(fn + "--- sucessfully edited")


Here is an example of using lxml to do this. I'm not exactly sure how you want the <attr/> nodes replaced, but this example should provide a pattern you can reuse.

Update - I changed it to replace each <attr> in tree2 with the corresponding node from tree1, in document order:

import copy
import lxml.etree

xml1 = '''<root><attr><chaos foo="0"/></attr><attr><arena foo="1"/></attr></root>'''
xml2 = '''<tree><attr><one/></attr><attr><two/></attr></tree>'''
tree1 = lxml.etree.fromstring(xml1)
tree2 = lxml.etree.fromstring(xml2)

# select <attr/> nodes from tree1, will be used to replace corresponding
# nodes in tree2
replacements = tree1.xpath('//attr')

# zip() pairs the nodes in document order and stops at the shorter list, so a
# tree2 with more <attr/> nodes than tree1 does not raise IndexError; the
# surplus nodes are simply left as they are
for attr, node in zip(tree2.xpath('//attr'), replacements):
    # replace the attr node in tree2 with a copy of its counterpart from
    # tree1 (a copy, so tree1 itself is left intact)
    attr.getparent().replace(attr, copy.deepcopy(node))

print(lxml.etree.tostring(tree2))

Result:

<tree>
  <attr><chaos foo="0"/></attr>
  <attr><arena foo="1"/></attr>
</tree>


This sounds like something that XSL-T transformations were made for. Have you tried that?

I'd also recommend a library like Beautiful Soup for parsing and manipulating XML.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜