Javascript / jQuery Find Text duplicates

2023-02-11 18:00 问答作者：

How would you approach finding duplicates in a text document. Duplicates can be a set of consecutive words or sentence. Sentence not necessary ending with a dot. let's say page contains a 开发者_运维百科document of 200 lines of which 2 sentences are identical, we want to highlight those 2 sentences as duplicates when "check duplicate button" is clicked.

Interesting question — here's an idea on how I'd do that probably: http://jsfiddle.net/SaQAs/1/ — Not anyhow optimized!

var text = $('p').text(),
    words = text.split(' '),
    sortedWords = words.slice(0).sort(),
    duplicateWords = [],
    sentences = text.split('.'),
    sortedSentences = sentences.slice(0).sort(),
    duplicateSentences = [];


for (var i=0; i<sortedWords.length-1; i++) {
    if (sortedWords[i+1] == sortedWords[i]) {
        duplicateWords.push(sortedWords[i]);
    }
}
duplicateWords = $.unique(duplicateWords);

for (var i=0; i<sortedSentences.length-1; i++) {
    if (sortedSentences[i+1] == sortedSentences[i]) {
        duplicateSentences.push(sortedSentences[i]);
    }
}
duplicateSentences = $.unique(duplicateSentences);

$('a.words').click(function(){
    var highlighted = $.map(words, function(word){
        if ($.inArray(word, duplicateWords) > -1)
            return '<span class="duplicate">' + word + '</span>';
        else return word;
    });
    $('p').html(highlighted.join(' '));
    return false;
});

$('a.sentences').click(function(){
    var highlighted = $.map(sentences, function(sentence){
        if ($.inArray(sentence, duplicateSentences) > -1)
            return '<span class="duplicate">' + sentence + '</span>';
        else return sentence;
    });
    $('p').html(highlighted.join('.'));
    return false;
});

Update 1

This one finds sequences of identical words: http://jsfiddle.net/YQdk5/1/ From here it shouldn't be hard to e.g. ignore any kind of punctuation at the end of fragments when comparing — you'd would just have to write your own version of an inArray method.

var text = $('p').text(),
    words = text.split(' '),
    sortedWords = words.slice(0).sort(),
    duplicateWords = []
    highlighted = [];

for (var i=0; i<sortedWords.length-1; i++) {
    if (sortedWords[i+1] == sortedWords[i]) {
        duplicateWords.push(sortedWords[i]);
    }
}
duplicateWords = $.unique(duplicateWords);

for (var j=0, m=[]; j<words.length; j++) {
    m.push($.inArray(words[j], duplicateWords) > -1);
    if (!m[j] && m[j-1]) 
        highlighted.push('</span>');
    else if (m[j] && !m[j-1])
        highlighted.push('<span class="duplicate">');
    highlighted.push(words[j]);
}

$('p').html(highlighted.join(' '));

Update 2

My regex-fu is weak, but this (pretty messy!) version seems to work okay: http://jsfiddle.net/YQdk5/2/ — I'm pretty sure that there might be a better way of doing this, but for now I've got to leave it alone! :D — Good luck!

Update 3

Thinking about it, I don't think that the code from the previous update is any good. That's why I've removed it. You can still find it here: http://jsfiddle.net/YQdk5/2/ The main point is to use a regex to match words, something along the lines of:

/^word(\.?)$/

Here is the solution that uses suffix tree:

function SuffixTree(text) {
    var regex = /\b\w+/g;
    var words = text.match(regex);
    var wave = [];
    var words_l = words.length;
    if (words_l == 0) return false;
    this.tree = this.node("", false);
    for (var i = 0; i < words_l; ++i) {
        var x = words[i] + "_";
        wave.push(this.tree);
        var wave_l = wave.length;
        for (var j = 0; j < wave_l; ++j) {
            var y = wave[j];
            if (typeof y[x] != 'undefined') y[x].count++;
            else y[x] = this.node(words[i], y);
            wave[j] = y[x];
        }
    }
}

SuffixTree.prototype = {
    dummy: {count: 1},

    node: function(word, num, parent) {
        return {
            count: 1,
            word: word,
            parent: parent
        };
    },

    duplicates: function(h) {
        this.dups = [];
        this.bypass(this.tree, h, 0);
        var l = this.dups.length;
        this.dups.sort(function(d1, d2) { return d1.depth > d2.depth ? 1 : -1; });
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            this.dups[i] = { s: " " + this.sentence(d.a) + " ", depth: d.depth, count: d.a.count };
        }
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            console.log(i, d.s);
        }
        for (var i = 0; i < l; ++i) {
            var d = this.dups[i];
            var fl = true;
            for (var j = i + 1; j < l; ++j) {
                if (this.dups[j].s.indexOf(d.s) != -1) fl = false;
            }
            if (fl) h(d.s.substr(1, d.s.length - 2), d.count);
        }
    },

    bypass: function(a, h, depth) {
        if (a.constructor != Object) return;
        var fl = true;
        for (var i in a) {
            if (i == 'parent') continue;
            var b = a[i];
            if (b.count == a.count) fl = false;
            this.bypass(b, h, depth + 1);
        }
        if (fl && a.count > 1) {
            this.dups.push({ a: a, depth: depth });
        }
    },

    sentence: function(a) {
        var s = a.word;
        while (a = a.parent) {
            s = a.word + " " + s;
        }
        return s;
    }
};

var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";

var T = new SuffixTree(text);
var h = function(s, c) {
    document.write(s + "[" + c + "]<br/>");
};
T.duplicates(h);

1) Split the input text to the array of words. 2) Build the suffix tree. 3) Find the longest suffixes of the tree. 4) Remove sentences that are contained in others (i.e. remove "is" that is a part of "this is a").

You can change the regular expression to take into account html tags.

I hope this helps you.

P.S. h is the callback for found duplicates.

You're javascript contains references to a javascript library named jQuery.

You are not including this in your HTML, and thus it will fail. You can include it via jquery cdn

And todays tip: Use the developer tools in your browser. In the console you can see what parts of javascript that fails.

继续阅读：duplicates javascript jquery text

Javascript / jQuery Find Text duplicates

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？