Use JavaScript regex to replace numerical HTML entities with their actual characters
I'm trying to use JavaScript & regex to replace numerical HTML entities with their actual Unicode characters, e.g.
foo&#39;s bar
→
foo's bar
This is what I've got so far:
"foo&#39;s bar".replace(/&#([^\s]*);/g, "$1"); // "foo39s bar"
All that's left to do is to replace the number with String.fromCharCode($1), but I can't seem to get it to work. How can I do this?
"foo&#39;s bar".replace(/&#(\d+);/g, function(match, match2) {return String.fromCharCode(+match2);})
"foo&#39;s bar".replace(/&#([^\s]*);/g, function(x, y) { return String.fromCharCode(y) })
The first argument (x) is the whole match, "&#39;" in the current example; y is the captured number, 39.
As well as using a callback function, you may want to consider adding support for hex character references (e.g. &#x1234;).
Also, fromCharCode may not be enough. E.g. &#67840; is a valid reference to a Phoenician character, but because it is outside the Basic Multilingual Plane, and JavaScript's String model is based on UTF-16 code units, not complete character code points, fromCharCode(67840) won't work. You'd need a UTF-16 encoder, for example:
/**
 * Builds a string from a list of Unicode code points, encoding each one as
 * UTF-16 code units.
 *
 * Values below 0x10000 are passed through as a single code unit. Values from
 * 0x10000 up to (but not including) 0x110000 are split into a high/low
 * surrogate pair. Any other value is skipped without error.
 *
 * @param {...number} codepoints - Code points to encode (read via `arguments`).
 * @returns {string} The string made of the encoded code units.
 */
String.fromCharCodePoint = function (/* codepoints */) {
  var units = [];
  var index = 0;
  var point;
  while (index < arguments.length) {
    point = arguments[index];
    index += 1;
    if (point < 0x10000) {
      // Fits in one UTF-16 code unit.
      units.push(point);
      continue;
    }
    if (point < 0x110000) {
      // Supplementary plane: 20-bit offset split into two 10-bit halves.
      var offset = point - 0x10000;
      units.push(0xD800 + ((offset >> 10) & 0x3FF));
      units.push(0xDC00 + (offset & 0x3FF));
    }
  }
  return String.fromCharCode.apply(String, units);
};
/**
 * Replaces numeric character references in `s` with the characters they name.
 *
 * Handles decimal (`&#NNN;`) and hexadecimal (`&#xHHH;`, case-insensitive)
 * forms. Both forms are matched by one regular expression in a single pass,
 * so text produced by decoding one reference is never re-scanned: for
 * example `&#38;#x41;` yields the literal text `&#x41;`, not `A`.
 *
 * Relies on `String.fromCharCodePoint` being defined before this is called.
 *
 * @param {string} s - Text that may contain numeric character references.
 * @returns {string} `s` with every numeric reference decoded.
 */
function decodeCharacterReferences(s) {
  return s.replace(/&#(?:x([0-9a-f]+)|(\d+));/gi, function (_, hex, dec) {
    // Exactly one of the two groups participates in any given match.
    var codePoint = hex ? parseInt(hex, 16) : parseInt(dec, 10);
    return String.fromCharCodePoint(codePoint);
  });
}
// Demo: a decimal and a hexadecimal reference to the same code point
// (67840 === 0x10900), which lies outside the Basic Multilingual Plane.
alert(decodeCharacterReferences('Hello &#67840; mum &#x10900;!'));
If you don't want to define all the entities yourself, you can let the browser do it for you. This snippet creates an empty p element, writes the HTML into it, and returns the text it produces. The p element is never added to the document.
/**
 * Decodes character references by letting the DOM parse the input.
 *
 * A fresh `p` element is created, `string` is assigned to its `innerHTML`,
 * and the resulting text is read back (`innerText` first, falling back to
 * `textContent` when that is falsy). The element is emptied before returning
 * and is never attached to the document by this function.
 *
 * NOTE(review): `string` is parsed as HTML markup; confirm this is acceptable
 * before passing untrusted input.
 *
 * @param {string} string - Markup/text containing character references.
 * @returns {string} The text content produced by parsing `string`.
 */
function translateEntities(string) {
  var holder = document.createElement('p');
  holder.innerHTML = string;
  var decoded = holder.innerText;
  if (!decoded) {
    decoded = holder.textContent;
  }
  holder.innerHTML = '';
  return decoded;
}
// Example: the numeric reference &#39; should come back as an apostrophe.
var s= 'foo&#39;s bar';
translateEntities(s);
/* returned value: (String)
foo's bar
*/
精彩评论