开发者

How can I transform numeric/character entities to their string literal forms without defining my own dictionary/map?

I run a bot on IRC. It captures a url pattern which a user sends in a message and returns the title.

Oftentimes, the title contains a numeric/character entity such as &quot; or &#39;. These entities are printed out literally. I'd like to convert them to their string literal versions, so &quot; would become ".

Is anyone aware of some sort of library or utility I could use in node.js/JS without having to define my own map/dictionary, such as below?

// Example of the hand-written lookup the question wants to avoid:
// the key is the entity as it appears in the page title, the value is
// the literal character it stands for.
var dictionary = {
   '&quot;' : '"',
};


We recently were in need of this for our JavaScript Room chat bot here on SO. We couldn't find anything for Node.js, so we ended up with this piece of code, which works fine (for most cases):

// Lookup table from a named HTML character entity, written exactly as it
// appears in markup (leading "&", trailing ";"), to the literal text it
// stands for. Keys are case-sensitive ("&Alpha;" and "&alpha;" differ).
// Invisible or easily-confused characters are written as \u escapes so the
// table survives copy/paste and re-encoding.
var _entities = {
    // Latin-1 (U+00A0 - U+00FF)
    "&nbsp;": "\u00a0",
    "&iexcl;": "¡",
    "&cent;": "¢",
    "&pound;": "£",
    "&curren;": "¤",
    "&yen;": "¥",
    "&brvbar;": "¦",
    "&sect;": "§",
    "&uml;": "¨",
    "&copy;": "©",
    "&ordf;": "ª",
    "&laquo;": "«",
    "&not;": "¬",
    // Soft hyphen is mapped to the empty string, i.e. dropped from the output.
    "&shy;": "",
    "&reg;": "®",
    "&macr;": "¯",
    "&deg;": "°",
    "&plusmn;": "±",
    "&sup2;": "²",
    "&sup3;": "³",
    "&acute;": "´",
    "&micro;": "µ",
    "&para;": "¶",
    "&middot;": "·",
    "&cedil;": "¸",
    "&sup1;": "¹",
    "&ordm;": "º",
    "&raquo;": "»",
    "&frac14;": "¼",
    "&frac12;": "½",
    "&frac34;": "¾",
    "&iquest;": "¿",
    "&Agrave;": "À",
    "&Aacute;": "Á",
    "&Acirc;": "Â",
    "&Atilde;": "Ã",
    "&Auml;": "Ä",
    "&Aring;": "Å",
    "&AElig;": "Æ",
    "&Ccedil;": "Ç",
    "&Egrave;": "È",
    "&Eacute;": "É",
    "&Ecirc;": "Ê",
    "&Euml;": "Ë",
    "&Igrave;": "Ì",
    "&Iacute;": "Í",
    "&Icirc;": "Î",
    "&Iuml;": "Ï",
    "&ETH;": "Ð",
    "&Ntilde;": "Ñ",
    "&Ograve;": "Ò",
    "&Oacute;": "Ó",
    "&Ocirc;": "Ô",
    "&Otilde;": "Õ",
    "&Ouml;": "Ö",
    "&times;": "×",
    "&Oslash;": "Ø",
    "&Ugrave;": "Ù",
    "&Uacute;": "Ú",
    "&Ucirc;": "Û",
    "&Uuml;": "Ü",
    "&Yacute;": "Ý",
    "&THORN;": "Þ",
    "&szlig;": "ß",
    "&agrave;": "à",
    "&aacute;": "á",
    "&acirc;": "â",
    "&atilde;": "ã",
    "&auml;": "ä",
    "&aring;": "å",
    "&aelig;": "æ",
    "&ccedil;": "ç",
    "&egrave;": "è",
    "&eacute;": "é",
    "&ecirc;": "ê",
    "&euml;": "ë",
    "&igrave;": "ì",
    "&iacute;": "í",
    "&icirc;": "î",
    "&iuml;": "ï",
    "&eth;": "ð",
    "&ntilde;": "ñ",
    "&ograve;": "ò",
    "&oacute;": "ó",
    "&ocirc;": "ô",
    "&otilde;": "õ",
    "&ouml;": "ö",
    "&divide;": "÷",
    "&oslash;": "ø",
    "&ugrave;": "ù",
    "&uacute;": "ú",
    "&ucirc;": "û",
    "&uuml;": "ü",
    "&yacute;": "ý",
    "&thorn;": "þ",
    "&yuml;": "ÿ",

    // Symbols, Greek letters and mathematical operators
    "&fnof;": "ƒ",
    "&Alpha;": "Α",
    "&Beta;": "Β",
    "&Gamma;": "Γ",
    "&Delta;": "Δ",
    "&Epsilon;": "Ε",
    "&Zeta;": "Ζ",
    "&Eta;": "Η",
    "&Theta;": "Θ",
    "&Iota;": "Ι",
    "&Kappa;": "Κ",
    "&Lambda;": "Λ",
    "&Mu;": "Μ",
    "&Nu;": "Ν",
    "&Xi;": "Ξ",
    "&Omicron;": "Ο",
    "&Pi;": "Π",
    "&Rho;": "Ρ",
    "&Sigma;": "Σ",
    "&Tau;": "Τ",
    "&Upsilon;": "Υ",
    "&Phi;": "Φ",
    "&Chi;": "Χ",
    "&Psi;": "Ψ",
    "&Omega;": "Ω",
    "&alpha;": "α",
    "&beta;": "β",
    "&gamma;": "γ",
    "&delta;": "δ",
    "&epsilon;": "ε",
    "&zeta;": "ζ",
    "&eta;": "η",
    "&theta;": "θ",
    "&iota;": "ι",
    "&kappa;": "κ",
    "&lambda;": "λ",
    "&mu;": "μ",
    "&nu;": "ν",
    "&xi;": "ξ",
    "&omicron;": "ο",
    "&pi;": "π",
    "&rho;": "ρ",
    "&sigmaf;": "ς",
    "&sigma;": "σ",
    "&tau;": "τ",
    "&upsilon;": "υ",
    "&phi;": "φ",
    "&chi;": "χ",
    "&psi;": "ψ",
    "&omega;": "ω",
    "&thetasym;": "ϑ",
    "&upsih;": "ϒ",
    "&piv;": "ϖ",
    "&bull;": "•",
    "&hellip;": "…",
    "&prime;": "′",
    "&Prime;": "″",
    "&oline;": "‾",
    "&frasl;": "⁄",
    "&weierp;": "℘",
    "&image;": "ℑ",
    "&real;": "ℜ",
    "&trade;": "™",
    "&alefsym;": "ℵ",
    "&larr;": "←",
    "&uarr;": "↑",
    "&rarr;": "→",
    "&darr;": "↓",
    "&harr;": "↔",
    "&crarr;": "↵",
    "&lArr;": "⇐",
    "&uArr;": "⇑",
    "&rArr;": "⇒",
    "&dArr;": "⇓",
    "&hArr;": "⇔",
    "&forall;": "∀",
    "&part;": "∂",
    "&exist;": "∃",
    "&empty;": "∅",
    "&nabla;": "∇",
    "&isin;": "∈",
    "&notin;": "∉",
    "&ni;": "∋",
    "&prod;": "∏",
    "&sum;": "∑",
    "&minus;": "−",
    "&lowast;": "∗",
    "&radic;": "√",
    "&prop;": "∝",
    "&infin;": "∞",
    "&ang;": "∠",
    "&and;": "∧",
    "&or;": "∨",
    "&cap;": "∩",
    "&cup;": "∪",
    "&int;": "∫",
    "&there4;": "∴",
    "&sim;": "∼",
    "&cong;": "≅",
    "&asymp;": "≈",
    "&ne;": "≠",
    "&equiv;": "≡",
    "&le;": "≤",
    "&ge;": "≥",
    "&sub;": "⊂",
    "&sup;": "⊃",
    "&nsub;": "⊄",
    "&sube;": "⊆",
    "&supe;": "⊇",
    "&oplus;": "⊕",
    "&otimes;": "⊗",
    "&perp;": "⊥",
    "&sdot;": "⋅",
    "&lceil;": "⌈",
    "&rceil;": "⌉",
    "&lfloor;": "⌊",
    "&rfloor;": "⌋",
    "&lang;": "〈",
    "&rang;": "〉",
    "&loz;": "◊",
    "&spades;": "♠",
    "&clubs;": "♣",
    "&hearts;": "♥",
    "&diams;": "♦",

    // Markup-significant characters
    "&quot;": "\"",
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    // Not part of HTML 4, but defined by XML/XHTML and HTML5 and common in titles.
    "&apos;": "'",

    // Extended Latin, spacing, and general punctuation
    "&OElig;": "Œ",
    "&oelig;": "œ",
    "&Scaron;": "Š",
    "&scaron;": "š",
    "&Yuml;": "Ÿ",
    "&circ;": "ˆ",
    "&tilde;": "˜",
    "&ensp;": "\u2002",
    "&emsp;": "\u2003",
    "&thinsp;": "\u2009",
    "&zwnj;": "\u200c",
    "&zwj;": "\u200d",
    "&lrm;": "\u200e",
    "&rlm;": "\u200f",
    "&ndash;": "–",
    "&mdash;": "—",
    "&lsquo;": "‘",
    "&rsquo;": "’",
    "&sbquo;": "‚",
    "&ldquo;": "“",
    "&rdquo;": "”",
    "&bdquo;": "„",
    "&dagger;": "†",
    "&Dagger;": "‡",
    "&permil;": "‰",
    "&lsaquo;": "‹",
    "&rsaquo;": "›",
    "&euro;": "€",
};

/**
 * Decode a single HTML entity token into the text it represents.
 *
 * Handles decimal references ("&#39;"), hexadecimal references ("&#x27;",
 * "&#X27;") and named entities found in the `_entities` table.
 *
 * @param {string} input - The complete entity text, starting with "&" and
 *     normally ending with ";".
 * @returns {?string} The decoded text, or null when the entity is unknown,
 *     malformed, or names a code point above U+10FFFF.
 */
function unescape_entity(input) {
    if (input.charAt(1) === '#') {
        // Validate the whole token instead of letting parseInt stop at the
        // first bad character: "&#x27;" used to parse as NaN (yielding U+0000)
        // and "&#12ab;" used to decode as code point 12.
        var numeric = /^&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));?$/.exec(input);
        if (numeric === null) {
            return null;
        }

        var codePoint = numeric[1] !== undefined
            ? parseInt(numeric[1], 16)
            : parseInt(numeric[2], 10);

        // String.fromCodePoint throws a RangeError past the end of Unicode.
        if (codePoint > 0x10FFFF) {
            return null;
        }

        // fromCodePoint (unlike fromCharCode) emits a surrogate pair for code
        // points above U+FFFF instead of truncating them to 16 bits.
        return String.fromCodePoint(codePoint);
    }

    // Call through Object.prototype so the check works even if the table has
    // no prototype, and so inherited names like "toString" never match.
    if (Object.prototype.hasOwnProperty.call(_entities, input)) {
        return _entities[input];
    }

    return null;
}

/**
 * Replace every HTML entity in a string with the text it represents.
 *
 * Each "&name;" / "&#digits;" style token is passed to unescape_entity.
 * Tokens it cannot decode are left in the output exactly as written.
 *
 * @param {string} input - Text that may contain HTML entities.
 * @returns {string} The text with all recognised entities decoded.
 */
function unescape2(input) {
    var entityRe = /&(#?)(\d{1,5}|\w{1,8});/gm;
    return input.replace(entityRe, function (match) {
        var decoded = unescape_entity(match);
        // A null result from a replace callback is stringified to "null",
        // so fall back to the original text for unknown entities.
        return decoded == null ? match : decoded;
    });
}

// Expose the decoder to other modules under the name `unescape`.
exports.unescape = unescape2;

Of course, if you need a full mapping, that would take a "bit" more effort. You can find a Python version on the web (ActiveState, IIRC) that works better, but it relies on Python's stdlib for the mappings, so in the end someone has to do the job and provide those mappings for JS.


If you use the numeric encoding you can use String.fromCharCode(codepoint);

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜