word ifilter for docx parser error

2022-12-14 08:37 问答作者：

.docx documents do not appear to be indexed.

I used a unique string in a .docx, but the .docx is not returned when I search on "one".

For example, take the following text:

"Here is the text for line one and here is the text for line two."

It will be extracted by the IFilter as:

"Here is the text for line oneand here is the text for line two."

So when the IFilter parses the .docx, it deletes the line-break separator and tries to parse "oneand here"...

So it seems that the Word ifilter for .docx concatenates the last word of a line with the first word of the next line.

Can anyone give some ideas of how to get around this issue?

Thanks in advance.

OK I figured this one out now. Basically the 64 bit IFilter is not working correctly. It merges words that are separated by line breaks and does not carry them through. I used Ionic.zip to access the docx zip archive and parsed the important xml files using a slightly modified version of DocxToText. This works perfectly now.

Here is the modified code originally created by Jevgenij Pankov

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;

/// <summary>
/// Extracts plain text from a .docx file by opening it as a zip archive,
/// locating the main document part through "[Content_Types].xml" and walking
/// the XML elements under <c>/w:document/w:body</c>.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> element that declares the main document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    private const string ContentTypesEntryName = "[Content_Types].xml";

    private readonly string docxFile;
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given file.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No input file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The archive does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
        {
            throw new InvalidOperationException("Input file not specified.");
        }

        // Usually it is "word/document.xml".
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
        {
            throw new InvalidDataException("It is not a valid Docx file.");
        }

        return ReadDocumentXml();
    }

    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// The PartName of the main document part with leading '/' characters
    /// removed, or <c>null</c> when "[Content_Types].xml" is missing or does
    /// not declare a main document part.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        XmlDocument contentTypes = LoadXmlEntry(ContentTypesEntryName);
        if (contentTypes == null)
        {
            return null;
        }

        // Create an XmlNamespaceManager for resolving namespaces.
        XmlNamespaceManager nsmgr = new XmlNamespaceManager(contentTypes.NameTable);
        nsmgr.AddNamespace("t", ContentTypeNamespace);

        // Find location of "document.xml".
        XmlElement overrideElement =
            contentTypes.DocumentElement.SelectSingleNode(DocumentXmlXPath, nsmgr) as XmlElement;
        if (overrideElement == null)
        {
            return null;
        }

        // PartName is written with a leading '/', zip entry names are not.
        return overrideElement.GetAttribute("PartName").TrimStart('/');
    }

    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document body, or an empty string when the entry
    /// or the body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        XmlDocument document = LoadXmlEntry(docxFileLocation);
        if (document == null)
        {
            return string.Empty;
        }

        XmlNamespaceManager nsmgr = new XmlNamespaceManager(document.NameTable);
        nsmgr.AddNamespace("w", WordprocessingMlNamespace);

        // ReadNode returns an empty string for a null node.
        return ReadNode(document.DocumentElement.SelectSingleNode(BodyXPath, nsmgr));
    }

    /// <summary>
    /// Loads the first zip entry whose name matches <paramref name="entryName"/>
    /// (ordinal, case-insensitive) into an <see cref="XmlDocument"/>.
    /// </summary>
    /// <param name="entryName">Name of the entry inside the archive.</param>
    /// <returns>The parsed document, or <c>null</c> when no entry matches.</returns>
    private XmlDocument LoadXmlEntry(string entryName)
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            foreach (ZipEntry entry in zip)
            {
                // Entry names are identifiers, not linguistic text, so compare ordinally.
                if (!string.Equals(entry.FileName, entryName, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                XmlDocument xmlDoc = new XmlDocument();
                // The file may come from an untrusted source: never resolve external entities/DTDs.
                xmlDoc.XmlResolver = null;
                // Whitespace inside text runs is significant and must survive loading.
                xmlDoc.PreserveWhitespace = true;

                using (var stream = new MemoryStream())
                {
                    entry.Extract(stream);
                    stream.Position = 0;
                    xmlDoc.Load(stream);
                }

                return xmlDoc;
            }
        }

        return null;
    }

    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>
    /// Text contained in the node; an empty string when <paramref name="node"/>
    /// is <c>null</c> or is not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
        {
            return string.Empty;
        }

        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of every element child of <paramref name="node"/> to
    /// <paramref name="sb"/>, recursing into nested elements.
    /// </summary>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element)
            {
                continue;
            }

            switch (child.LocalName)
            {
                case "t": // Text
                    // With xml:space="preserve" the run's whitespace is part of
                    // the text, so it is copied verbatim. Trimming it and then
                    // appending a space (as was done before) split words whose
                    // fragments were stored in separate runs, e.g. " wor" + "ld".
                    string space = ((XmlElement)child).GetAttribute("xml:space");
                    sb.Append(space == "preserve"
                        ? child.InnerText
                        : child.InnerText.TrimEnd());
                    break;

                case "cr": // Carriage return
                case "br": // Break
                    sb.Append(Environment.NewLine);
                    break;

                case "tab": // Tab
                    sb.Append('\t');
                    break;

                case "p": // Paragraph
                    AppendNodeText(child, sb);
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default:
                    // Containers are walked for nested text; elements with no
                    // element children contribute nothing.
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
}

Here is the usage of this code...

// Build an extractor for the .docx at filepath, then pull out its plain text.
var extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();

Placing the cursor in the middle of a word and saving the document will result in the word being split between two XML tags, with a "_GoBack" bookmark in between. The result is that after parsing with this routine, a space is placed between these two string fragments instead of merging them back into one string. It's easy enough to handle the "_GoBack" scenario, but there are probably other cases as well, such as "Track Changes" and who knows what else.

Does a more detailed parsing algorithm exist for DOCX?

继续阅读：ifilter

word ifilter for docx parser error

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？