开发者

Regex Linkify: WWW, nonWWW, HTTP, nonHTTP

I'm struggling to linkify links with or without "www" / "http".

This is what I got:

        noProtocolUrl = /\b((?:www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/g,
    httpOrMailtoUrl = /\b((?:[a-z][\w-]+:)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/gi,
        linkifier = function (html) {
            return FormatLink(html
                        .replace(noProtocolUrl, '<a href="<``>://$1" rel="nofollow external" class="external_link">$1</a>')  // NOTE: we escape `"http` as `"<``>` to make sure `httpOrMailtoUrl` below doesn't find it as a false-positive
                        .replace(httpOrMailtoUrl, '<a href="$1" rel="nofollow external" class="external_link">$1</a>')
                        .replace(/"<``>/g, '"http'));  // reinsert `"http`

It's working great, except that simple links with http:// are getting the linkify treatment twice.

http://google.com would become two links: http:// and http://google.com

Any idea on how to fix this?

Thanks!

EDIT

Well, I got it working for any link except links without **http** and **www**, like bit.ly/foo

If anyone knows how to catch those links too, you're welcome.

// Matches scheme-less URLs that begin with "www.".
//   $1 - delimiter before the URL: start of input, a quote, "(", whitespace, or "&lt;"
//   $2 - the URL text itself ("www." followed by at least two more dot-separated parts)
//   $3 - delimiter after the URL: optional ":", "?" or dots followed by whitespace/end
//        of input, or "&gt;", or one of ) " ' ,
var noProtocolUrl = /(^|["'(\s]|&lt;)(www\..+?\..+?)((?:[:?]|\.+)?(?:\s|$)|&gt;|[)"',])/g,
// Matches text that starts with a scheme-like prefix (a letter, one or more word
// characters or hyphens, then ":"). The body may contain parenthesised groups (one
// nested level), and the match must not end in whitespace or the listed punctuation.
// $1 is the whole URL. Case-insensitive.
httpOrMailtoUrl = /\b((?:[a-z][\w-]+:)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/gi,
// Wraps every URL found in `html` in an anchor tag and hands the result to FormatLink
// (defined elsewhere). The three replaces are order-dependent: scheme-less URLs are
// linked first using a placeholder in place of "http", so the second pass cannot
// re-match the href it just produced; the third pass swaps the placeholder back.
linkifier = function ( html ) {
    return FormatLink(html
                .replace( noProtocolUrl, '$1<a href="<``>://$2" rel="nofollow external" class="external_link">$2</a>$3' )  // NOTE: we escape `"http` as `"<``>` to make sure `httpOrMailtoUrl` below doesn't find it as a false-positive
                .replace( httpOrMailtoUrl, '<a href="$1" rel="nofollow external" class="external_link">$1</a>' )
                .replace( /"<``>/g, '"http' ));  // reinsert `"http`
  },


use

// Matches scheme-less URLs whose host ends in one of the listed TLDs.
//   $1 - delimiter before the URL: start of input, a quote, "(", whitespace, or "&lt;"
//   $2 - the URL: one or more dot-terminated labels of [a-z0-9-], a TLD from the list,
//        and an optional path that starts with "/" plus a letter, digit or underscore
//        and then runs up to the next whitespace
//   $3 - delimiter after the URL: optional ":", "?" or dots followed by whitespace/end
//        of input, or "&gt;", or one of ) " ' ,
// NOTE: there is no `i` flag, so host labels and TLDs match in lower case only.
var noProtocolUrl = /(^|["'(\s]|&lt;)((?:[a-z0-9-]+\.)+(?:ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xxx|ye|yt|za|zm|zw)(?:\/[a-zA-Z0-9_][^\s]*)?)((?:[:?]|\.+)?(?:\s|$)|&gt;|[)"',])/g,

in your code.

The huge list of alternatives is IANA's authoritative list of TLDs and serves to terminate the server portion of the URL. It isn't strictly necessary, but it probably comes in handy for reducing the number of false positives when linkifying arbitrary text.

the other modification is a complement of the 2nd referenced subexpression, namely ...

(?:\/[a-zA-Z0-9_][^\s]*)?

that matches an optional path and possibly query and fragment identifier - actually everything up to the end of the string or the first whitespace character.

You may wish to have a look at RFC 3986, which defines the formal syntax of URIs. Building a regex from the specification in that document, with the scheme portion marked as an optional match, should turn your matching regexes into an even more robust solution. You'll probably get along without that much precision, however.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜