Listing trigrams in JavaScript
I am trying to list all the trigrams(3 letter words) in a given string depending on the user input. The use开发者_开发技巧r may enter 1,2 or 3 characters. I wrote the following code:
if (3 == trigram.length) {
var re = new RegExp(trigram);
} else if (2 == trigram.length) {
trigram = trigram + '\\S|\\S' + trigram;
var re = new RegExp(trigram);
} else if (1 == trigram.length) {
trigram = trigram + '\\S\\S|\\S\\S' + trigram + '|\\S' + trigram + '\\S';
var re = new RegExp(trigram);
} else {
alert("Trigram search pattern can be either one, two or three characters!");
return null;
}
var re = new RegExp(trigram, "ig"); alert(re);
trigramList = givenString.match(re);
This is working fine except that if I have the following sequence of characters in my string "KDSGKHAGSKH" and I am searching for trigrams consisting 'A' my code returns only "KHA" where I am expecting it to return {KHA, HAG, AGS}
Here are two simple functions that seem to be what you're looking for
String.prototype.ngrams = function(n) {
var r = [];
for(var i = 0; i <= this.length - n; i++)
r.push(this.substring(i, i + n));
return r;
}
Array.prototype.grep = function(re) {
var r = [];
for(var i = 0; i < this.length; i++)
if(re.test(this[i]))
r.push(this[i]);
return r;
}
s = "abcdefghjkl";
alert(s.ngrams(3).grep(/d/))
prints "bcd", "cde", "def". Not the most efficient but simple.
The problem with the original is that regular expressions set the end of a successful match as where to start of the next match, which means you can't get overlapping matches so easily. You need to find a way to make the matching string exactly one character long, so that the starting index is always one more than the start of the previously successful match. You can do this with positive lookahead, and use capture groups to get whatever matches the lookahead.
var onegram = /A(?=(\S\S))|\S(?=(\SA))|\S(?=(A\S))/ig;
var str = 'KDSGKHAGSKH';
var match
var ngrams = [];
while ((match = onegram.exec(str)) != null) {
ngrams.push(match.join(''));
}
You can generate the RE fairly simply (though not with optimal efficiency) with the help of one additional method on String:
String.prototype.repeat = function (n) {
if (n<1) return '';
var accum = '', c=this;
for (; n; n >>=1) {
if (1&n) accum += c;
c += c;
}
return accum;
};
function ngrammer(kgram, n) {
var m = n - kgram.length;
var branches = [];
for (var i = 0; i <= m; ++i) {
branches.push(('\\S'.repeat(i) + kgram + '\\S'.repeat(m-i) + '))').replace(/^\\?./, '$&(?=('));
}
return new RegExp(branches.join('|'), 'ig');
}
var onegram = ngrammer('A', 3);
...
精彩评论