开发者

Comparing 2 node sets based on attribute sequence

I'm trying to build up a kind of library XML, comparing various nodes and combining them for later reuse. The logic should be fairly straightforward: if the tag_XX attribute value sequence of a given language is equal to the tag_YY attribute value sequence of another language, the nodes can be combined. See the XML example below.

<Book>
<Section>
    <GB>
        <Para tag_GB="L1">
            <Content_GB>string_1</Content_GB>
        </Para>
        <Para tag_GB="Illanc">
            <Content_GB>string_2</Content_GB>
        </Para>
        <Para tag_GB="|PLB">
            <Content_GB>string_3</Content_GB>
        </Para>
        <Para tag_GB="L1">
            <Content_GB>string_4</Content_GB>
        </Para>
        <Para tag_GB="Sub">
            <Content_GB>string_5</Content_GB>
        </Para>
        <Para tag_GB="L3">
            <Content_GB>string_6</Content_GB>
        </Para>
        <Para tag_GB="Subbull">
            <Content_GB>string_7</Content_GB>
        </Para>
    </GB>
    <!-- German translations - OK because same attribute sequence -->
    <DE>
        <Para tag_DE="L1">
            <Content_DE>German_translation of_string_1</Content_DE>
        </Para>
        <Para tag_DE="Illanc">
            <Content_DE>German_translation of_string_2</Content_DE>
        </Para>
        <Para tag_DE="|PLB">
            <Content_DE>German_translation of_string_3</Content_DE>
        </Para>
        <Para tag_DE="L1">
            <Content_DE>German_translation of_string_4</Content_DE>
        </Para>
        <Para tag_DE="Sub">
            <Content_DE>German_translation of_string_5</Content_DE>
        </Para>
        <Para tag_DE="L3">
            <Content_DE>German_translation of_string_6</Content_DE>
        </Para>
        <Para tag_DE="Subbull">
            <Content_DE>German_translation of_string_7</Content_DE>
        </Para>
    </DE>
    <!-- Danish translations - NG because not same attribute sequence -->
    <DK>
        <Para tag_DK="L1">
            <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
        </Para>
        <Para tag_DK="L1_sub">
            <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
        </Para>
        <Para tag_DK="Illanc">
            <Content_DK>Danish_translation_of_string_2</Content_DK>
        </Para>
        <Para tag_DK="L1">
            <Content_DK>Danish_translation_of_string_4</Content_DK>
        </Para>
        <Para tag_DK="|PLB">
            <Content_DK>Danish_translation_of_string_3</Content_DK>
        </Para>
        <Para tag_DK="L3">
            <Content_DK>Danish_translation_of_string_6</Content_DK>
        </Para>
        <Para tag_DK="Sub">
            <Content_DK>Danish_translation_of_string_5</Content_DK>
        </Para>
        <Para tag_DK="Subbull">
            <Content_DK>Danish_translation_of_string_7</Content_DK>
        </Para>
    </DK>
</Section>
</Book>

So

GB tag_GB value sequence = L1 -> Illanc -> ... -> Subbull

DE tag_DE value sequence = L1 -> Illanc -> ... -> Subbull (same as GB, so OK)

DK tag_DK value sequence = L1 -> L1_sub -> oops, expected Illanc, meaning this sequence is not the same as GB's and the locale can be ignored

Since the German and English node sets have the same attribute sequence, I would like to combine them as follows:

<Book>
<Dictionary>
    <Para tag="L1">
        <Content_GB>string_1</Content_GB>
        <Content_DE>German_translation of_string_1</Content_DE>
    </Para>
    <Para tag="Illanc">
        <Content_GB>string_2</Content_GB>
        <Content_DE>German_translation of_string_2</Content_DE>
    </Para>
    <Para tag="|PLB">
        <Content_GB>string_3</Content_GB>
        <Content_DE>German_translation of_string_3</Content_DE>
    </Para>
    <Para tag="L1">
        <Content_GB>string_4</Content_GB>
        <Content_DE>German_translation of_string_4</Content_DE>
    </Para>
    <Para tag="Sub">
        <Content_GB>string_5</Content_GB>
        <Content_DE>German_translation of_string_5</Content_DE>
    </Para>
    <Para tag="L3">
        <Content_GB>string_6</Content_GB>
        <Content_DE>German_translation of_string_6</Content_DE>
    </Para>
    <Para tag="Subbull">
        <Content_GB>string_7</Content_GB>
        <Content_DE>German_translation of_string_7</Content_DE>
    </Para>
</Dictionary>
</Book>

The stylesheet I use is the following :

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Purpose: for each Section, emit one Dictionary whose Para elements pair the
     GB (English master) content with the content found at the same index in
     the other locale elements. -->
<!-- Two xsl:output declarations are present; both set indent="yes".
     NOTE(review): the default-namespace declaration on the first one is in
     scope only on that empty element, so it presumably has no effect on the
     result tree; confirm before relying on it. -->
<xsl:output method="xml" version="1.0" xmlns="http://www.w3.org/1999/xhtml" encoding="UTF-8" indent="yes"/>
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<!-- Document node: copy it and process its children. -->
<xsl:template match="/">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>
<!-- Identity rule: copy every attribute and node, recursing into its content. -->
<xsl:template match="@* | node()">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>
<!-- Text nodes are written with leading/trailing whitespace removed and inner
     whitespace collapsed.
     NOTE(review): text nodes also match node() in the identity rule above and
     neither template sets a priority, so which rule applies appears to depend
     on declaration order; keep this template after the identity rule or give
     it an explicit priority. -->
<xsl:template match="text()">
    <xsl:value-of select="normalize-space(.)"/>
</xsl:template>
<!-- Replaces each Section with a Dictionary built from its GB paragraphs. -->
<xsl:template match="Section">
    <!-- Reference tag list: the first attribute of every GB/Para -->
    <xsl:variable name="Ref_tagList" select="GB/Para/attribute()[1]"/>
    <Dictionary>
        <xsl:for-each select="GB/Para">
            <!-- Index of this GB Para; used to pick the Para at the same index in other locales -->
            <xsl:variable name="pos" select="position()"/>
            <Para tag="{@tag_GB}">
                <!-- Copy English Master -->
                <xsl:apply-templates select="element()[1]"/>
                <!-- Absolute path: visits the non-GB children of every Book/Section in the
                     document, not only those of the Section currently being processed. -->
                <xsl:for-each select="//Book/Section/element()[not(self::GB)]">
                    <!-- Current locale tag list: the first attribute of every Para of this locale -->
                    <xsl:variable name="Curr_tagList" select="Para/attribute()[1]"/>
                    <!-- NOTE(review): "=" applied to two sequences is an XPath general comparison;
                         it is true as soon as ANY item on the left equals ANY item on the right.
                         It does not check that both lists hold the same values in the same order,
                         so a locale that shares a single tag value with GB (e.g. "L1") passes.
                         An order-sensitive test would compare a string-join() of each list, or
                         use deep-equal() on their string values. -->
                    <xsl:if test="$Ref_tagList = $Curr_tagList">
                        <!-- Copy the current locale's content if its tag list equals the reference tag list -->
                        <xsl:apply-templates select="Para[position()=$pos]/element()[1]"/>
                    </xsl:if>
                </xsl:for-each>
            </Para>
        </xsl:for-each>
    </Dictionary>
</xsl:template>
</xsl:stylesheet>

Apart from probably not being the most efficient way to do this (I'm fairly new to the XSLT game...), it's not working either. The logic I had in mind is to take the attribute sequence of the English master and, if the attribute sequence of any other locale is equal, copy it; if not, ignore it. But for some reason node sets that have a different attribute sequence are also happily copied (as seen below). Can someone tell me where my logic conflicts with reality? Thanks in advance!

Current output, including the Danish content that should have been ignored:

<Book>
<Dictionary>
    <Para tag="L1">
        <Content_GB>string_1</Content_GB>
        <Content_DE>German_translation of_string_1</Content_DE>
        <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
    </Para>
    <Para tag="Illanc">
        <Content_GB>string_2</Content_GB>
        <Content_DE>German_translation of_string_2</Content_DE>
        <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
    </Para>
    <Para tag="|PLB">
        <Content_GB>string_3</Content_GB>
        <Content_DE>German_translation of_string_3</Content_DE>
        <Content_DK>Danish_translation_of_string_2</Content_DK>
    </Para>
    <Para tag="L1">
        <Content_GB>string_4</Content_GB>
        <Content_DE>German_translation of_string_4</Content_DE>
        <Content_DK>Danish_translation_of_string_4</Content_DK>
    </Para>
    <Para tag="Sub">
        <Content_GB>string_5</Content_GB>
        <Content_DE>German_translation of_string_5</Content_DE>
        <Content_DK>Danish_translation_of_string_3</Content_DK>
    </Para>
    <Para tag="L3">
        <Content_GB>string_6</Content_GB>
        <Content_DE>German_translation of_string_6</Content_DE>
        <Content_DK>Danish_translation_of_string_6</Content_DK>
    </Para>
    <Para tag="Subbull">
        <Content_GB>string_7</Content_GB>
        <Content_DE>German_translation of_string_7</Content_DE>
        <Content_DK>Danish_translation_of_string_5</Content_DK>
    </Para>
</Dictionary>
</Book>


This might not be the best solution. I've used the following XSLT 2.0 features:

  • I compared the sequence of attributes using string-join().
  • I've exploited the possibility of navigating a variable that holds a temporary tree (an "RTF variable" in XSLT 1.0 terms) directly with XPath.

There are probably more XSLT 2.0 facilities that can solve your problem, but I think the BIG problem here is your input document.

I'm sorry, I did not look at your current transform; I just implemented one from scratch. Hope it helps:

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <!--
        Merges the GB (master) paragraphs with those of every following
        sibling locale whose sequence of Para attribute values is identical
        to GB's, and writes the result as Book/Dictionary/Para.
    -->
    <xsl:output indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!--
        GB is the master locale: it drives the output. Only locales that
        FOLLOW the GB element are considered as merge candidates.
    -->
    <xsl:template match="GB">
        <!--
            Signature of the master's attribute value sequence. Every value
            is prefixed with its length (for example "2:L1") before the
            values are joined, so two different sequences can never yield
            the same string. A plain join on '-' would treat ("a-b","c") and
            ("a","b-c") as equal.
        -->
        <xsl:variable name="masterSignature"
            select="string-join(
                    for $a in Para/@*
                    return concat(string-length($a), ':', $a),
                    '')"/>

        <!--
            Temporary tree with one match element per accepted locale; each
            holds the child elements of that locale's Paras in document order.
        -->
        <xsl:variable name="matches">
            <xsl:for-each select="following-sibling::*
                [string-join(
                    for $a in Para/@*
                    return concat(string-length($a), ':', $a),
                    '') = $masterSignature]">
                <match><xsl:copy-of select="Para/*"/></match>
            </xsl:for-each>
        </xsl:variable>

        <Book>
            <Dictionary>
                <xsl:apply-templates select="Para">
                    <xsl:with-param name="matches" select="$matches"/>
                </xsl:apply-templates>
            </Dictionary>
        </Book>
    </xsl:template>

    <!--
        Emits one merged Para per GB Para.
        Param matches: the temporary tree built by the GB template.
        position() is the index of this Para among the Paras selected by the
        GB template.
    -->
    <xsl:template match="Para[parent::GB]">
        <xsl:param name="matches"/>
        <xsl:variable name="pos" select="position()"/>
        <Para tag="{@tag_GB}">
            <xsl:copy-of select="Content_GB"/>
            <!-- Picks the element at the same index from each accepted locale;
                 this lines up only when every Para has exactly one child element. -->
            <xsl:copy-of select="$matches/match/*[position()=$pos]"/>
        </Para>
    </xsl:template>

    <!-- Suppress text reached through the built-in rules, e.g. inside the
         locale elements that have no template of their own. -->
    <xsl:template match="text()"/>

</xsl:stylesheet>

When applied to the input document provided in the question, the following output is produced:

<Book>
   <Dictionary>
      <Para tag="L1">
         <Content_GB>string_1</Content_GB>
         <Content_DE>German_translation of_string_1</Content_DE>
      </Para>
      <Para tag="Illanc">
         <Content_GB>string_2</Content_GB>
         <Content_DE>German_translation of_string_2</Content_DE>
      </Para>
      <Para tag="|PLB">
         <Content_GB>string_3</Content_GB>
         <Content_DE>German_translation of_string_3</Content_DE>
      </Para>
      <Para tag="L1">
         <Content_GB>string_4</Content_GB>
         <Content_DE>German_translation of_string_4</Content_DE>
      </Para>
      <Para tag="Sub">
         <Content_GB>string_5</Content_GB>
         <Content_DE>German_translation of_string_5</Content_DE>
      </Para>
      <Para tag="L3">
         <Content_GB>string_6</Content_GB>
         <Content_DE>German_translation of_string_6</Content_DE>
      </Para>
      <Para tag="Subbull">
         <Content_GB>string_7</Content_GB>
         <Content_DE>German_translation of_string_7</Content_DE>
      </Para>
   </Dictionary>
</Book>


This stylesheet makes use of <xsl:for-each-group>

  1. First, groups the elements by their sequence of Para/@* values
  2. Then, for each of those sequences, groups the Para using the number of following sibling elements that have attributes that start with "tag".

I have predicate filters on the matches for @*, to ensure that it is comparing the ones that start with "tag_". That may not be necessary, but would help ensure that it still worked if other attributes were added to the instance XML.

<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <!--
        Groups the locale elements of each Section by their sequence of
        tag_* attribute values and writes one Dictionary per group, merging
        the Paras that sit at the same position in each locale of the group.
    -->
    <!-- Single output declaration; equivalent to the two separate ones it
         replaces (the namespace declaration previously placed on xsl:output
         was scoped to that empty element only). -->
    <xsl:output method="xml" version="1.0" encoding="UTF-8"
        omit-xml-declaration="yes" indent="yes"/>

    <!-- Identity rule: copy everything that has no more specific template. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Text nodes are written whitespace-normalised; the explicit priority
         makes this rule win over node() in the identity rule. -->
    <xsl:template match="text()" priority="1">
        <xsl:value-of select="normalize-space(.)"/>
    </xsl:template>

    <!--
        Section is replaced by one Dictionary per run of ADJACENT locale
        elements that share the same tag sequence.

        The grouping key prefixes every tag value with its length (for
        example "4:|PLB") before joining. A plain join on '|' is ambiguous
        as soon as a tag value itself contains '|', which the sample data
        does ("|PLB"), so different sequences could produce the same key.
    -->
    <xsl:template match="Section">
        <xsl:for-each-group select="*"
            group-adjacent="string-join(
            for $t in Para/@*[starts-with(local-name(),'tag_')]
            return concat(string-length($t), ':', $t),
            '')">
            <Dictionary>
                <!-- Paras with the same number of following tagged siblings
                     sit at the same position (counted from the end) in their
                     locale, so they are merged into one output Para. -->
                <xsl:for-each-group select="current-group()/Para"
                    group-by="count(
                    following-sibling::*[@*[starts-with(local-name(),'tag_')]])">
                    <!-- The tag of the first Para in the group labels the merged Para. -->
                    <Para tag="{(current-group()/@*[starts-with(local-name(),'tag_')])[1]}">
                        <xsl:copy-of select="current-group()/*"/>
                    </Para>
                </xsl:for-each-group>
            </Dictionary>
        </xsl:for-each-group>
    </xsl:template>

</xsl:stylesheet>

When applied to the sample input XML, produces the following output:

<Book>
   <Dictionary>
      <Para tag="L1">
         <Content_GB>string_1</Content_GB>
         <Content_DE>German_translation of_string_1</Content_DE>
      </Para>
      <Para tag="Illanc">
         <Content_GB>string_2</Content_GB>
         <Content_DE>German_translation of_string_2</Content_DE>
      </Para>
      <Para tag="|PLB">
         <Content_GB>string_3</Content_GB>
         <Content_DE>German_translation of_string_3</Content_DE>
      </Para>
      <Para tag="L1">
         <Content_GB>string_4</Content_GB>
         <Content_DE>German_translation of_string_4</Content_DE>
      </Para>
      <Para tag="Sub">
         <Content_GB>string_5</Content_GB>
         <Content_DE>German_translation of_string_5</Content_DE>
      </Para>
      <Para tag="L3">
         <Content_GB>string_6</Content_GB>
         <Content_DE>German_translation of_string_6</Content_DE>
      </Para>
      <Para tag="Subbull">
         <Content_GB>string_7</Content_GB>
         <Content_DE>German_translation of_string_7</Content_DE>
      </Para>
   </Dictionary>
   <Dictionary>
      <Para tag="L1">
         <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
      </Para>
      <Para tag="L1_sub">
         <Content_DK>Partial_Danish_translation_of_string_1</Content_DK>
      </Para>
      <Para tag="Illanc">
         <Content_DK>Danish_translation_of_string_2</Content_DK>
      </Para>
      <Para tag="L1">
         <Content_DK>Danish_translation_of_string_4</Content_DK>
      </Para>
      <Para tag="|PLB">
         <Content_DK>Danish_translation_of_string_3</Content_DK>
      </Para>
      <Para tag="L3">
         <Content_DK>Danish_translation_of_string_6</Content_DK>
      </Para>
      <Para tag="Sub">
         <Content_DK>Danish_translation_of_string_5</Content_DK>
      </Para>
      <Para tag="Subbull">
         <Content_DK>Danish_translation_of_string_7</Content_DK>
      </Para>
   </Dictionary>
</Book>
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜