From f2de5dc8c87b4625790fb45446c296d476b1e747 Mon Sep 17 00:00:00 2001 From: mrsdizzie Date: Thu, 7 Mar 2019 15:12:01 -0500 Subject: [PATCH] Replace linkRegex with xurls library (#6261) * Replace linkRegex with xurls library Rather than maintaining a complicated regex to match URLs for autolinking, gitea can use this existing go library that takes care of the matching with very little code change to gitea itself. After spending a while trying to find the perfect regex for all cases this library still works better as it is more flexible than a single regex ever will be. This will also fix the following issues: #5844 #3095 #3381 This passes all our current tests and I've added new ones mentioned in those issues as well. * Use xurls.StrictMatchingScheme instead of xurls.Strict This is much faster and we only care about https? links to preserve existing behavior. --- Gopkg.lock | 9 + Gopkg.toml | 4 + modules/markup/html.go | 5 +- modules/markup/html_test.go | 9 + vendor/github.com/mvdan/xurls/LICENSE | 27 + vendor/github.com/mvdan/xurls/schemes.go | 299 ++++ vendor/github.com/mvdan/xurls/tlds.go | 1557 ++++++++++++++++++ vendor/github.com/mvdan/xurls/tlds_pseudo.go | 24 + vendor/github.com/mvdan/xurls/xurls.go | 107 ++ 9 files changed, 2038 insertions(+), 3 deletions(-) create mode 100644 vendor/github.com/mvdan/xurls/LICENSE create mode 100644 vendor/github.com/mvdan/xurls/schemes.go create mode 100644 vendor/github.com/mvdan/xurls/tlds.go create mode 100644 vendor/github.com/mvdan/xurls/tlds_pseudo.go create mode 100644 vendor/github.com/mvdan/xurls/xurls.go diff --git a/Gopkg.lock b/Gopkg.lock index b1103ce08..00dea0587 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -725,6 +725,14 @@ pruneopts = "NUT" revision = "02ccfbfaf0cc627aa3aec8ef7ed5cfeec5b43f63" +[[projects]] + digest = "1:63953ffb90bbc880c612d576fcfd973a5904277d25ec9e2d8d5719bf67969662" + name = "github.com/mvdan/xurls" + packages = ["."] + pruneopts = "NUT" + revision = 
"e52e821cbfe8fe163ff6f8628ab5869b11fc05af" + version = "v2.0.0" + [[projects]] digest = "1:2be1d891535ce3d6d2a3db9087f07415e909744e9eff1a30f8f0b2519df60ae6" name = "github.com/nfnt/resize" @@ -1293,6 +1301,7 @@ "github.com/mcuadros/go-version", "github.com/microcosm-cc/bluemonday", "github.com/msteinert/pam", + "github.com/mvdan/xurls", "github.com/nfnt/resize", "github.com/pquerna/otp", "github.com/pquerna/otp/totp", diff --git a/Gopkg.toml b/Gopkg.toml index 3a981f529..f5dcb4686 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -113,3 +113,7 @@ ignored = ["google.golang.org/appengine*"] [[constraint]] name = "github.com/prometheus/client_golang" version = "0.9.0" + +[[constraint]] + name = "github.com/mvdan/xurls" + version = "2.0.0" diff --git a/modules/markup/html.go b/modules/markup/html.go index dab6d4e8e..036b664b0 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/util" "github.com/Unknwon/com" + "github.com/mvdan/xurls" "golang.org/x/net/html" "golang.org/x/net/html/atom" ) @@ -64,9 +65,7 @@ var ( // https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail) emailRegex = regexp.MustCompile("[a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*") - // matches http/https links. used for autlinking those. 
partly modified from - // the original present in autolink.js - linkRegex = regexp.MustCompile(`(?:(?:http|https):\/\/(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+(?:\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)(?:(?:\/[\+~%\/\.\w\-]*)?\??(?:[\-\+:=&;%@\.\w]*)#?(?:[\.\!\/\\\w]*))?`) + linkRegex, _ = xurls.StrictMatchingScheme("https?://") ) // regexp for full links to issues/pulls diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index f430cb04b..ff6820199 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -104,6 +104,15 @@ func TestRender_links(t *testing.T) { test( "http://142.42.1.1/", `

<p><a href="http://142.42.1.1/" rel="nofollow">http://142.42.1.1/</a></p>

`) + test( + "https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd", + `

<p><a href="https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd" rel="nofollow">https://github.com/go-gitea/gitea/?p=aaa/bbb.html#ccc-ddd</a></p>

`) + test( + "https://en.wikipedia.org/wiki/URL_(disambiguation)", + `

<p><a href="https://en.wikipedia.org/wiki/URL_(disambiguation)" rel="nofollow">https://en.wikipedia.org/wiki/URL_(disambiguation)</a></p>

`) + test( + "https://foo_bar.example.com/", + `

<p><a href="https://foo_bar.example.com/" rel="nofollow">https://foo_bar.example.com/</a></p>

`) // Test that should *not* be turned into URL test( diff --git a/vendor/github.com/mvdan/xurls/LICENSE b/vendor/github.com/mvdan/xurls/LICENSE new file mode 100644 index 000000000..7d71d51a5 --- /dev/null +++ b/vendor/github.com/mvdan/xurls/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2015, Daniel Martí. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/vendor/github.com/mvdan/xurls/schemes.go b/vendor/github.com/mvdan/xurls/schemes.go new file mode 100644 index 000000000..01b7944ae --- /dev/null +++ b/vendor/github.com/mvdan/xurls/schemes.go @@ -0,0 +1,299 @@ +// Generated by schemesgen + +package xurls + +// Schemes is a sorted list of all IANA assigned schemes. +// +// Source: +// https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv +var Schemes = []string{ + `aaa`, + `aaas`, + `about`, + `acap`, + `acct`, + `acr`, + `adiumxtra`, + `afp`, + `afs`, + `aim`, + `appdata`, + `apt`, + `attachment`, + `aw`, + `barion`, + `beshare`, + `bitcoin`, + `bitcoincash`, + `blob`, + `bolo`, + `browserext`, + `callto`, + `cap`, + `chrome`, + `chrome-extension`, + `cid`, + `coap`, + `coap+tcp`, + `coap+ws`, + `coaps`, + `coaps+tcp`, + `coaps+ws`, + `com-eventbrite-attendee`, + `content`, + `conti`, + `crid`, + `cvs`, + `data`, + `dav`, + `diaspora`, + `dict`, + `did`, + `dis`, + `dlna-playcontainer`, + `dlna-playsingle`, + `dns`, + `dntp`, + `dtn`, + `dvb`, + `ed2k`, + `elsi`, + `example`, + `facetime`, + `fax`, + `feed`, + `feedready`, + `file`, + `filesystem`, + `finger`, + `fish`, + `ftp`, + `geo`, + `gg`, + `git`, + `gizmoproject`, + `go`, + `gopher`, + `graph`, + `gtalk`, + `h323`, + `ham`, + `hcap`, + `hcp`, + `http`, + `https`, + `hxxp`, + `hxxps`, + `hydrazone`, + `iax`, + `icap`, + `icon`, + `im`, + `imap`, + `info`, + `iotdisco`, + `ipn`, + `ipp`, + `ipps`, + `irc`, + `irc6`, + `ircs`, + `iris`, + `iris.beep`, + `iris.lwz`, + `iris.xpc`, + `iris.xpcs`, + `isostore`, + `itms`, + `jabber`, + `jar`, + `jms`, + `keyparc`, + `lastfm`, + `ldap`, + `ldaps`, + `lvlt`, + `magnet`, + `mailserver`, + `mailto`, + `maps`, + `market`, + `message`, + `microsoft.windows.camera`, + `microsoft.windows.camera.multipicker`, + `microsoft.windows.camera.picker`, + `mid`, + `mms`, + `modem`, + `mongodb`, + `moz`, + `ms-access`, + `ms-browser-extension`, + `ms-drive-to`, + `ms-enrollment`, + `ms-excel`, + 
`ms-gamebarservices`, + `ms-gamingoverlay`, + `ms-getoffice`, + `ms-help`, + `ms-infopath`, + `ms-inputapp`, + `ms-lockscreencomponent-config`, + `ms-media-stream-id`, + `ms-mixedrealitycapture`, + `ms-officeapp`, + `ms-people`, + `ms-project`, + `ms-powerpoint`, + `ms-publisher`, + `ms-restoretabcompanion`, + `ms-screenclip`, + `ms-screensketch`, + `ms-search`, + `ms-search-repair`, + `ms-secondary-screen-controller`, + `ms-secondary-screen-setup`, + `ms-settings`, + `ms-settings-airplanemode`, + `ms-settings-bluetooth`, + `ms-settings-camera`, + `ms-settings-cellular`, + `ms-settings-cloudstorage`, + `ms-settings-connectabledevices`, + `ms-settings-displays-topology`, + `ms-settings-emailandaccounts`, + `ms-settings-language`, + `ms-settings-location`, + `ms-settings-lock`, + `ms-settings-nfctransactions`, + `ms-settings-notifications`, + `ms-settings-power`, + `ms-settings-privacy`, + `ms-settings-proximity`, + `ms-settings-screenrotation`, + `ms-settings-wifi`, + `ms-settings-workplace`, + `ms-spd`, + `ms-sttoverlay`, + `ms-transit-to`, + `ms-useractivityset`, + `ms-virtualtouchpad`, + `ms-visio`, + `ms-walk-to`, + `ms-whiteboard`, + `ms-whiteboard-cmd`, + `ms-word`, + `msnim`, + `msrp`, + `msrps`, + `mtqp`, + `mumble`, + `mupdate`, + `mvn`, + `news`, + `nfs`, + `ni`, + `nih`, + `nntp`, + `notes`, + `ocf`, + `oid`, + `onenote`, + `onenote-cmd`, + `opaquelocktoken`, + `openpgp4fpr`, + `pack`, + `palm`, + `paparazzi`, + `pkcs11`, + `platform`, + `pop`, + `pres`, + `prospero`, + `proxy`, + `pwid`, + `psyc`, + `qb`, + `query`, + `redis`, + `rediss`, + `reload`, + `res`, + `resource`, + `rmi`, + `rsync`, + `rtmfp`, + `rtmp`, + `rtsp`, + `rtsps`, + `rtspu`, + `secondlife`, + `service`, + `session`, + `sftp`, + `sgn`, + `shttp`, + `sieve`, + `simpleledger`, + `sip`, + `sips`, + `skype`, + `smb`, + `sms`, + `smtp`, + `snews`, + `snmp`, + `soap.beep`, + `soap.beeps`, + `soldat`, + `spiffe`, + `spotify`, + `ssh`, + `steam`, + `stun`, + `stuns`, + `submit`, + `svn`, + 
`tag`, + `teamspeak`, + `tel`, + `teliaeid`, + `telnet`, + `tftp`, + `things`, + `thismessage`, + `tip`, + `tn3270`, + `tool`, + `turn`, + `turns`, + `tv`, + `udp`, + `unreal`, + `urn`, + `ut2004`, + `v-event`, + `vemmi`, + `ventrilo`, + `videotex`, + `vnc`, + `view-source`, + `wais`, + `webcal`, + `wpid`, + `ws`, + `wss`, + `wtai`, + `wyciwyg`, + `xcon`, + `xcon-userid`, + `xfire`, + `xmlrpc.beep`, + `xmlrpc.beeps`, + `xmpp`, + `xri`, + `ymsgr`, + `z39.50`, + `z39.50r`, + `z39.50s`, +} diff --git a/vendor/github.com/mvdan/xurls/tlds.go b/vendor/github.com/mvdan/xurls/tlds.go new file mode 100644 index 000000000..084ab84d4 --- /dev/null +++ b/vendor/github.com/mvdan/xurls/tlds.go @@ -0,0 +1,1557 @@ +// Generated by tldsgen + +package xurls + +// TLDs is a sorted list of all public top-level domains. +// +// Sources: +// * https://data.iana.org/TLD/tlds-alpha-by-domain.txt +// * https://publicsuffix.org/list/effective_tld_names.dat +var TLDs = []string{ + `aaa`, + `aarp`, + `abarth`, + `abb`, + `abbott`, + `abbvie`, + `abc`, + `able`, + `abogado`, + `abudhabi`, + `ac`, + `academy`, + `accenture`, + `accountant`, + `accountants`, + `aco`, + `active`, + `actor`, + `ad`, + `adac`, + `ads`, + `adult`, + `ae`, + `aeg`, + `aero`, + `aetna`, + `af`, + `afamilycompany`, + `afl`, + `africa`, + `ag`, + `agakhan`, + `agency`, + `ai`, + `aig`, + `aigo`, + `airbus`, + `airforce`, + `airtel`, + `akdn`, + `al`, + `alfaromeo`, + `alibaba`, + `alipay`, + `allfinanz`, + `allstate`, + `ally`, + `alsace`, + `alstom`, + `am`, + `americanexpress`, + `americanfamily`, + `amex`, + `amfam`, + `amica`, + `amsterdam`, + `analytics`, + `android`, + `anquan`, + `anz`, + `ao`, + `aol`, + `apartments`, + `app`, + `apple`, + `aq`, + `aquarelle`, + `ar`, + `arab`, + `aramco`, + `archi`, + `army`, + `arpa`, + `art`, + `arte`, + `as`, + `asda`, + `asia`, + `associates`, + `at`, + `athleta`, + `attorney`, + `au`, + `auction`, + `audi`, + `audible`, + `audio`, + `auspost`, + `author`, + `auto`, + 
`autos`, + `avianca`, + `aw`, + `aws`, + `ax`, + `axa`, + `az`, + `azure`, + `ba`, + `baby`, + `baidu`, + `banamex`, + `bananarepublic`, + `band`, + `bank`, + `bar`, + `barcelona`, + `barclaycard`, + `barclays`, + `barefoot`, + `bargains`, + `baseball`, + `basketball`, + `bauhaus`, + `bayern`, + `bb`, + `bbc`, + `bbt`, + `bbva`, + `bcg`, + `bcn`, + `bd`, + `be`, + `beats`, + `beauty`, + `beer`, + `bentley`, + `berlin`, + `best`, + `bestbuy`, + `bet`, + `bf`, + `bg`, + `bh`, + `bharti`, + `bi`, + `bible`, + `bid`, + `bike`, + `bing`, + `bingo`, + `bio`, + `biz`, + `bj`, + `black`, + `blackfriday`, + `blanco`, + `blockbuster`, + `blog`, + `bloomberg`, + `blue`, + `bm`, + `bms`, + `bmw`, + `bn`, + `bnl`, + `bnpparibas`, + `bo`, + `boats`, + `boehringer`, + `bofa`, + `bom`, + `bond`, + `boo`, + `book`, + `booking`, + `bosch`, + `bostik`, + `boston`, + `bot`, + `boutique`, + `box`, + `br`, + `bradesco`, + `bridgestone`, + `broadway`, + `broker`, + `brother`, + `brussels`, + `bs`, + `bt`, + `budapest`, + `bugatti`, + `build`, + `builders`, + `business`, + `buy`, + `buzz`, + `bv`, + `bw`, + `by`, + `bz`, + `bzh`, + `ca`, + `cab`, + `cafe`, + `cal`, + `call`, + `calvinklein`, + `cam`, + `camera`, + `camp`, + `cancerresearch`, + `canon`, + `capetown`, + `capital`, + `capitalone`, + `car`, + `caravan`, + `cards`, + `care`, + `career`, + `careers`, + `cars`, + `cartier`, + `casa`, + `case`, + `caseih`, + `cash`, + `casino`, + `cat`, + `catering`, + `catholic`, + `cba`, + `cbn`, + `cbre`, + `cbs`, + `cc`, + `cd`, + `ceb`, + `center`, + `ceo`, + `cern`, + `cf`, + `cfa`, + `cfd`, + `cg`, + `ch`, + `chanel`, + `channel`, + `charity`, + `chase`, + `chat`, + `cheap`, + `chintai`, + `christmas`, + `chrome`, + `chrysler`, + `church`, + `ci`, + `cipriani`, + `circle`, + `cisco`, + `citadel`, + `citi`, + `citic`, + `city`, + `cityeats`, + `ck`, + `cl`, + `claims`, + `cleaning`, + `click`, + `clinic`, + `clinique`, + `clothing`, + `cloud`, + `club`, + `clubmed`, + `cm`, + `cn`, + `co`, 
+ `coach`, + `codes`, + `coffee`, + `college`, + `cologne`, + `com`, + `comcast`, + `commbank`, + `community`, + `company`, + `compare`, + `computer`, + `comsec`, + `condos`, + `construction`, + `consulting`, + `contact`, + `contractors`, + `cooking`, + `cookingchannel`, + `cool`, + `coop`, + `corsica`, + `country`, + `coupon`, + `coupons`, + `courses`, + `cr`, + `credit`, + `creditcard`, + `creditunion`, + `cricket`, + `crown`, + `crs`, + `cruise`, + `cruises`, + `csc`, + `cu`, + `cuisinella`, + `cv`, + `cw`, + `cx`, + `cy`, + `cymru`, + `cyou`, + `cz`, + `dabur`, + `dad`, + `dance`, + `data`, + `date`, + `dating`, + `datsun`, + `day`, + `dclk`, + `dds`, + `de`, + `deal`, + `dealer`, + `deals`, + `degree`, + `delivery`, + `dell`, + `deloitte`, + `delta`, + `democrat`, + `dental`, + `dentist`, + `desi`, + `design`, + `dev`, + `dhl`, + `diamonds`, + `diet`, + `digital`, + `direct`, + `directory`, + `discount`, + `discover`, + `dish`, + `diy`, + `dj`, + `dk`, + `dm`, + `dnp`, + `do`, + `docs`, + `doctor`, + `dodge`, + `dog`, + `doha`, + `domains`, + `dot`, + `download`, + `drive`, + `dtv`, + `dubai`, + `duck`, + `dunlop`, + `duns`, + `dupont`, + `durban`, + `dvag`, + `dvr`, + `dz`, + `earth`, + `eat`, + `ec`, + `eco`, + `edeka`, + `edu`, + `education`, + `ee`, + `eg`, + `email`, + `emerck`, + `energy`, + `engineer`, + `engineering`, + `enterprises`, + `epost`, + `epson`, + `equipment`, + `er`, + `ericsson`, + `erni`, + `es`, + `esq`, + `estate`, + `esurance`, + `et`, + `etisalat`, + `eu`, + `eurovision`, + `eus`, + `events`, + `everbank`, + `exchange`, + `expert`, + `exposed`, + `express`, + `extraspace`, + `fage`, + `fail`, + `fairwinds`, + `faith`, + `family`, + `fan`, + `fans`, + `farm`, + `farmers`, + `fashion`, + `fast`, + `fedex`, + `feedback`, + `ferrari`, + `ferrero`, + `fi`, + `fiat`, + `fidelity`, + `fido`, + `film`, + `final`, + `finance`, + `financial`, + `fire`, + `firestone`, + `firmdale`, + `fish`, + `fishing`, + `fit`, + `fitness`, + `fj`, + `fk`, + 
`flickr`, + `flights`, + `flir`, + `florist`, + `flowers`, + `fly`, + `fm`, + `fo`, + `foo`, + `food`, + `foodnetwork`, + `football`, + `ford`, + `forex`, + `forsale`, + `forum`, + `foundation`, + `fox`, + `fr`, + `free`, + `fresenius`, + `frl`, + `frogans`, + `frontdoor`, + `frontier`, + `ftr`, + `fujitsu`, + `fujixerox`, + `fun`, + `fund`, + `furniture`, + `futbol`, + `fyi`, + `ga`, + `gal`, + `gallery`, + `gallo`, + `gallup`, + `game`, + `games`, + `gap`, + `garden`, + `gb`, + `gbiz`, + `gd`, + `gdn`, + `ge`, + `gea`, + `gent`, + `genting`, + `george`, + `gf`, + `gg`, + `ggee`, + `gh`, + `gi`, + `gift`, + `gifts`, + `gives`, + `giving`, + `gl`, + `glade`, + `glass`, + `gle`, + `global`, + `globo`, + `gm`, + `gmail`, + `gmbh`, + `gmo`, + `gmx`, + `gn`, + `godaddy`, + `gold`, + `goldpoint`, + `golf`, + `goo`, + `goodyear`, + `goog`, + `google`, + `gop`, + `got`, + `gov`, + `gp`, + `gq`, + `gr`, + `grainger`, + `graphics`, + `gratis`, + `green`, + `gripe`, + `grocery`, + `group`, + `gs`, + `gt`, + `gu`, + `guardian`, + `gucci`, + `guge`, + `guide`, + `guitars`, + `guru`, + `gw`, + `gy`, + `hair`, + `hamburg`, + `hangout`, + `haus`, + `hbo`, + `hdfc`, + `hdfcbank`, + `health`, + `healthcare`, + `help`, + `helsinki`, + `here`, + `hermes`, + `hgtv`, + `hiphop`, + `hisamitsu`, + `hitachi`, + `hiv`, + `hk`, + `hkt`, + `hm`, + `hn`, + `hockey`, + `holdings`, + `holiday`, + `homedepot`, + `homegoods`, + `homes`, + `homesense`, + `honda`, + `honeywell`, + `horse`, + `hospital`, + `host`, + `hosting`, + `hot`, + `hoteles`, + `hotels`, + `hotmail`, + `house`, + `how`, + `hr`, + `hsbc`, + `ht`, + `hu`, + `hughes`, + `hyatt`, + `hyundai`, + `ibm`, + `icbc`, + `ice`, + `icu`, + `id`, + `ie`, + `ieee`, + `ifm`, + `ikano`, + `il`, + `im`, + `imamat`, + `imdb`, + `immo`, + `immobilien`, + `in`, + `inc`, + `industries`, + `infiniti`, + `info`, + `ing`, + `ink`, + `institute`, + `insurance`, + `insure`, + `int`, + `intel`, + `international`, + `intuit`, + `investments`, + `io`, + 
`ipiranga`, + `iq`, + `ir`, + `irish`, + `is`, + `iselect`, + `ismaili`, + `ist`, + `istanbul`, + `it`, + `itau`, + `itv`, + `iveco`, + `jaguar`, + `java`, + `jcb`, + `jcp`, + `je`, + `jeep`, + `jetzt`, + `jewelry`, + `jio`, + `jll`, + `jm`, + `jmp`, + `jnj`, + `jo`, + `jobs`, + `joburg`, + `jot`, + `joy`, + `jp`, + `jpmorgan`, + `jprs`, + `juegos`, + `juniper`, + `kaufen`, + `kddi`, + `ke`, + `kerryhotels`, + `kerrylogistics`, + `kerryproperties`, + `kfh`, + `kg`, + `kh`, + `ki`, + `kia`, + `kim`, + `kinder`, + `kindle`, + `kitchen`, + `kiwi`, + `km`, + `kn`, + `koeln`, + `komatsu`, + `kosher`, + `kp`, + `kpmg`, + `kpn`, + `kr`, + `krd`, + `kred`, + `kuokgroup`, + `kw`, + `ky`, + `kyoto`, + `kz`, + `la`, + `lacaixa`, + `ladbrokes`, + `lamborghini`, + `lamer`, + `lancaster`, + `lancia`, + `lancome`, + `land`, + `landrover`, + `lanxess`, + `lasalle`, + `lat`, + `latino`, + `latrobe`, + `law`, + `lawyer`, + `lb`, + `lc`, + `lds`, + `lease`, + `leclerc`, + `lefrak`, + `legal`, + `lego`, + `lexus`, + `lgbt`, + `li`, + `liaison`, + `lidl`, + `life`, + `lifeinsurance`, + `lifestyle`, + `lighting`, + `like`, + `lilly`, + `limited`, + `limo`, + `lincoln`, + `linde`, + `link`, + `lipsy`, + `live`, + `living`, + `lixil`, + `lk`, + `llc`, + `loan`, + `loans`, + `locker`, + `locus`, + `loft`, + `lol`, + `london`, + `lotte`, + `lotto`, + `love`, + `lpl`, + `lplfinancial`, + `lr`, + `ls`, + `lt`, + `ltd`, + `ltda`, + `lu`, + `lundbeck`, + `lupin`, + `luxe`, + `luxury`, + `lv`, + `ly`, + `ma`, + `macys`, + `madrid`, + `maif`, + `maison`, + `makeup`, + `man`, + `management`, + `mango`, + `map`, + `market`, + `marketing`, + `markets`, + `marriott`, + `marshalls`, + `maserati`, + `mattel`, + `mba`, + `mc`, + `mckinsey`, + `md`, + `me`, + `med`, + `media`, + `meet`, + `melbourne`, + `meme`, + `memorial`, + `men`, + `menu`, + `merckmsd`, + `metlife`, + `mg`, + `mh`, + `miami`, + `microsoft`, + `mil`, + `mini`, + `mint`, + `mit`, + `mitsubishi`, + `mk`, + `ml`, + `mlb`, + `mls`, + 
`mm`, + `mma`, + `mn`, + `mo`, + `mobi`, + `mobile`, + `mobily`, + `moda`, + `moe`, + `moi`, + `mom`, + `monash`, + `money`, + `monster`, + `mopar`, + `mormon`, + `mortgage`, + `moscow`, + `moto`, + `motorcycles`, + `mov`, + `movie`, + `movistar`, + `mp`, + `mq`, + `mr`, + `ms`, + `msd`, + `mt`, + `mtn`, + `mtr`, + `mu`, + `museum`, + `mutual`, + `mv`, + `mw`, + `mx`, + `my`, + `mz`, + `na`, + `nab`, + `nadex`, + `nagoya`, + `name`, + `nationwide`, + `natura`, + `navy`, + `nba`, + `nc`, + `ne`, + `nec`, + `net`, + `netbank`, + `netflix`, + `network`, + `neustar`, + `new`, + `newholland`, + `news`, + `next`, + `nextdirect`, + `nexus`, + `nf`, + `nfl`, + `ng`, + `ngo`, + `nhk`, + `ni`, + `nico`, + `nike`, + `nikon`, + `ninja`, + `nissan`, + `nissay`, + `nl`, + `no`, + `nokia`, + `northwesternmutual`, + `norton`, + `now`, + `nowruz`, + `nowtv`, + `np`, + `nr`, + `nra`, + `nrw`, + `ntt`, + `nu`, + `nyc`, + `nz`, + `obi`, + `observer`, + `off`, + `office`, + `okinawa`, + `olayan`, + `olayangroup`, + `oldnavy`, + `ollo`, + `om`, + `omega`, + `one`, + `ong`, + `onion`, + `onl`, + `online`, + `onyourside`, + `ooo`, + `open`, + `oracle`, + `orange`, + `org`, + `organic`, + `origins`, + `osaka`, + `otsuka`, + `ott`, + `ovh`, + `pa`, + `page`, + `panasonic`, + `paris`, + `pars`, + `partners`, + `parts`, + `party`, + `passagens`, + `pay`, + `pccw`, + `pe`, + `pet`, + `pf`, + `pfizer`, + `pg`, + `ph`, + `pharmacy`, + `phd`, + `philips`, + `phone`, + `photo`, + `photography`, + `photos`, + `physio`, + `piaget`, + `pics`, + `pictet`, + `pictures`, + `pid`, + `pin`, + `ping`, + `pink`, + `pioneer`, + `pizza`, + `pk`, + `pl`, + `place`, + `play`, + `playstation`, + `plumbing`, + `plus`, + `pm`, + `pn`, + `pnc`, + `pohl`, + `poker`, + `politie`, + `porn`, + `post`, + `pr`, + `pramerica`, + `praxi`, + `press`, + `prime`, + `pro`, + `prod`, + `productions`, + `prof`, + `progressive`, + `promo`, + `properties`, + `property`, + `protection`, + `pru`, + `prudential`, + `ps`, + `pt`, + 
`pub`, + `pw`, + `pwc`, + `py`, + `qa`, + `qpon`, + `quebec`, + `quest`, + `qvc`, + `racing`, + `radio`, + `raid`, + `re`, + `read`, + `realestate`, + `realtor`, + `realty`, + `recipes`, + `red`, + `redstone`, + `redumbrella`, + `rehab`, + `reise`, + `reisen`, + `reit`, + `reliance`, + `ren`, + `rent`, + `rentals`, + `repair`, + `report`, + `republican`, + `rest`, + `restaurant`, + `review`, + `reviews`, + `rexroth`, + `rich`, + `richardli`, + `ricoh`, + `rightathome`, + `ril`, + `rio`, + `rip`, + `rmit`, + `ro`, + `rocher`, + `rocks`, + `rodeo`, + `rogers`, + `room`, + `rs`, + `rsvp`, + `ru`, + `rugby`, + `ruhr`, + `run`, + `rw`, + `rwe`, + `ryukyu`, + `sa`, + `saarland`, + `safe`, + `safety`, + `sakura`, + `sale`, + `salon`, + `samsclub`, + `samsung`, + `sandvik`, + `sandvikcoromant`, + `sanofi`, + `sap`, + `sarl`, + `sas`, + `save`, + `saxo`, + `sb`, + `sbi`, + `sbs`, + `sc`, + `sca`, + `scb`, + `schaeffler`, + `schmidt`, + `scholarships`, + `school`, + `schule`, + `schwarz`, + `science`, + `scjohnson`, + `scor`, + `scot`, + `sd`, + `se`, + `search`, + `seat`, + `secure`, + `security`, + `seek`, + `select`, + `sener`, + `services`, + `ses`, + `seven`, + `sew`, + `sex`, + `sexy`, + `sfr`, + `sg`, + `sh`, + `shangrila`, + `sharp`, + `shaw`, + `shell`, + `shia`, + `shiksha`, + `shoes`, + `shop`, + `shopping`, + `shouji`, + `show`, + `showtime`, + `shriram`, + `si`, + `silk`, + `sina`, + `singles`, + `site`, + `sj`, + `sk`, + `ski`, + `skin`, + `sky`, + `skype`, + `sl`, + `sling`, + `sm`, + `smart`, + `smile`, + `sn`, + `sncf`, + `so`, + `soccer`, + `social`, + `softbank`, + `software`, + `sohu`, + `solar`, + `solutions`, + `song`, + `sony`, + `soy`, + `space`, + `spiegel`, + `sport`, + `spot`, + `spreadbetting`, + `sr`, + `srl`, + `srt`, + `st`, + `stada`, + `staples`, + `star`, + `starhub`, + `statebank`, + `statefarm`, + `statoil`, + `stc`, + `stcgroup`, + `stockholm`, + `storage`, + `store`, + `stream`, + `studio`, + `study`, + `style`, + `su`, + `sucks`, + 
`supplies`, + `supply`, + `support`, + `surf`, + `surgery`, + `suzuki`, + `sv`, + `swatch`, + `swiftcover`, + `swiss`, + `sx`, + `sy`, + `sydney`, + `symantec`, + `systems`, + `sz`, + `tab`, + `taipei`, + `talk`, + `taobao`, + `target`, + `tatamotors`, + `tatar`, + `tattoo`, + `tax`, + `taxi`, + `tc`, + `tci`, + `td`, + `tdk`, + `team`, + `tech`, + `technology`, + `tel`, + `telefonica`, + `temasek`, + `tennis`, + `teva`, + `tf`, + `tg`, + `th`, + `thd`, + `theater`, + `theatre`, + `tiaa`, + `tickets`, + `tienda`, + `tiffany`, + `tips`, + `tires`, + `tirol`, + `tj`, + `tjmaxx`, + `tjx`, + `tk`, + `tkmaxx`, + `tl`, + `tm`, + `tmall`, + `tn`, + `to`, + `today`, + `tokyo`, + `tools`, + `top`, + `toray`, + `toshiba`, + `total`, + `tours`, + `town`, + `toyota`, + `toys`, + `tr`, + `trade`, + `trading`, + `training`, + `travel`, + `travelchannel`, + `travelers`, + `travelersinsurance`, + `trust`, + `trv`, + `tt`, + `tube`, + `tui`, + `tunes`, + `tushu`, + `tv`, + `tvs`, + `tw`, + `tz`, + `ua`, + `ubank`, + `ubs`, + `uconnect`, + `ug`, + `uk`, + `unicom`, + `university`, + `uno`, + `uol`, + `ups`, + `us`, + `uy`, + `uz`, + `va`, + `vacations`, + `vana`, + `vanguard`, + `vc`, + `ve`, + `vegas`, + `ventures`, + `verisign`, + `vermögensberater`, + `vermögensberatung`, + `versicherung`, + `vet`, + `vg`, + `vi`, + `viajes`, + `video`, + `vig`, + `viking`, + `villas`, + `vin`, + `vip`, + `virgin`, + `visa`, + `vision`, + `vistaprint`, + `viva`, + `vivo`, + `vlaanderen`, + `vn`, + `vodka`, + `volkswagen`, + `volvo`, + `vote`, + `voting`, + `voto`, + `voyage`, + `vu`, + `vuelos`, + `wales`, + `walmart`, + `walter`, + `wang`, + `wanggou`, + `warman`, + `watch`, + `watches`, + `weather`, + `weatherchannel`, + `webcam`, + `weber`, + `website`, + `wed`, + `wedding`, + `weibo`, + `weir`, + `wf`, + `whoswho`, + `wien`, + `wiki`, + `williamhill`, + `win`, + `windows`, + `wine`, + `winners`, + `wme`, + `wolterskluwer`, + `woodside`, + `work`, + `works`, + `world`, + `wow`, + `ws`, + 
`wtc`, + `wtf`, + `xbox`, + `xerox`, + `xfinity`, + `xihuan`, + `xin`, + `xxx`, + `xyz`, + `yachts`, + `yahoo`, + `yamaxun`, + `yandex`, + `ye`, + `yodobashi`, + `yoga`, + `yokohama`, + `you`, + `youtube`, + `yt`, + `yun`, + `za`, + `zappos`, + `zara`, + `zero`, + `zip`, + `zippo`, + `zm`, + `zone`, + `zuerich`, + `zw`, + `ελ`, + `бг`, + `бел`, + `дети`, + `ею`, + `католик`, + `ком`, + `мкд`, + `мон`, + `москва`, + `онлайн`, + `орг`, + `рус`, + `рф`, + `сайт`, + `срб`, + `укр`, + `қаз`, + `հայ`, + `קום`, + `ابوظبي`, + `اتصالات`, + `ارامكو`, + `الاردن`, + `الجزائر`, + `السعودية`, + `السعوديه`, + `السعودیة`, + `السعودیۃ`, + `العليان`, + `المغرب`, + `اليمن`, + `امارات`, + `ايران`, + `ایران`, + `بارت`, + `بازار`, + `بيتك`, + `بھارت`, + `تونس`, + `سودان`, + `سوريا`, + `سورية`, + `شبكة`, + `عراق`, + `عرب`, + `عمان`, + `فلسطين`, + `قطر`, + `كاثوليك`, + `كوم`, + `مصر`, + `مليسيا`, + `موبايلي`, + `موقع`, + `همراه`, + `پاكستان`, + `پاکستان`, + `ڀارت`, + `कॉम`, + `नेट`, + `भारत`, + `भारतम्`, + `भारोत`, + `संगठन`, + `বাংলা`, + `ভারত`, + `ভাৰত`, + `ਭਾਰਤ`, + `ભારત`, + `ଭାରତ`, + `இந்தியா`, + `இலங்கை`, + `சிங்கப்பூர்`, + `భారత్`, + `ಭಾರತ`, + `ഭാരതം`, + `ලංකා`, + `คอม`, + `ไทย`, + `გე`, + `みんな`, + `クラウド`, + `グーグル`, + `コム`, + `ストア`, + `セール`, + `ファッション`, + `ポイント`, + `世界`, + `中信`, + `中国`, + `中國`, + `中文网`, + `企业`, + `佛山`, + `信息`, + `健康`, + `八卦`, + `公司`, + `公益`, + `台湾`, + `台灣`, + `商城`, + `商店`, + `商标`, + `嘉里`, + `嘉里大酒店`, + `在线`, + `大众汽车`, + `大拿`, + `天主教`, + `娱乐`, + `家電`, + `工行`, + `广东`, + `微博`, + `慈善`, + `我爱你`, + `手机`, + `手表`, + `招聘`, + `政务`, + `政府`, + `新加坡`, + `新闻`, + `时尚`, + `書籍`, + `机构`, + `淡马锡`, + `游戏`, + `澳門`, + `澳门`, + `点看`, + `珠宝`, + `移动`, + `组织机构`, + `网址`, + `网店`, + `网站`, + `网络`, + `联通`, + `臺灣`, + `诺基亚`, + `谷歌`, + `购物`, + `通販`, + `集团`, + `電訊盈科`, + `飞利浦`, + `食品`, + `餐厅`, + `香格里拉`, + `香港`, + `닷넷`, + `닷컴`, + `삼성`, + `한국`, +} diff --git a/vendor/github.com/mvdan/xurls/tlds_pseudo.go b/vendor/github.com/mvdan/xurls/tlds_pseudo.go new file mode 100644 index 000000000..94c67d15b --- 
/dev/null +++ b/vendor/github.com/mvdan/xurls/tlds_pseudo.go @@ -0,0 +1,24 @@ +// Copyright (c) 2015, Daniel Martí +// See LICENSE for licensing information + +package xurls + +// PseudoTLDs is a sorted list of some widely used unofficial TLDs. +// +// Sources: +// * https://en.wikipedia.org/wiki/Pseudo-top-level_domain +// * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains +// * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00 +// * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml +var PseudoTLDs = []string{ + `bit`, // Namecoin + `example`, // Example domain + `exit`, // Tor exit node + `gnu`, // GNS by public key + `i2p`, // I2P network + `invalid`, // Invalid domain + `local`, // Local network + `localhost`, // Local network + `test`, // Test domain + `zkey`, // GNS domain name +} diff --git a/vendor/github.com/mvdan/xurls/xurls.go b/vendor/github.com/mvdan/xurls/xurls.go new file mode 100644 index 000000000..d6279ae60 --- /dev/null +++ b/vendor/github.com/mvdan/xurls/xurls.go @@ -0,0 +1,107 @@ +// Copyright (c) 2015, Daniel Martí +// See LICENSE for licensing information + +// Package xurls extracts urls from plain text using regular expressions. 
+package xurls + +import ( + "bytes" + "regexp" +) + +//go:generate go run generate/tldsgen/main.go +//go:generate go run generate/schemesgen/main.go + +const ( + letter = `\p{L}` + mark = `\p{M}` + number = `\p{N}` + iriChar = letter + mark + number + currency = `\p{Sc}` + otherSymb = `\p{So}` + endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb + otherPunc = `\p{Po}` + midChar = endChar + `|` + otherPunc + wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)` + wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]` + wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}` + wellAll = wellParen + `|` + wellBrack + `|` + wellBrace + pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+` + + iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?` + domain = `(` + iri + `\.)+` + octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` + ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b` + ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:` + ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)` + port = `(:[0-9]*)?` +) + +// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid +// scheme, and not just the known ones. +var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` + +// SchemesNoAuthority is a sorted list of some well-known url schemes that are +// followed by ":" instead of "://". 
+var SchemesNoAuthority = []string{ + `bitcoin`, // Bitcoin + `file`, // Files + `magnet`, // Torrent magnets + `mailto`, // Mail + `sms`, // SMS + `tel`, // Telephone + `xmpp`, // XMPP +} + +func anyOf(strs ...string) string { + var b bytes.Buffer + b.WriteByte('(') + for i, s := range strs { + if i != 0 { + b.WriteByte('|') + } + b.WriteString(regexp.QuoteMeta(s)) + } + b.WriteByte(')') + return b.String() +} + +func strictExp() string { + schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)` + return `(?i)` + schemes + `(?-i)` + pathCont +} + +func relaxedExp() string { + site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)` + hostName := `(` + site + `|` + ipAddr + `)` + webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)` + return strictExp() + `|` + webURL +} + +// Strict produces a regexp that matches any URL with a scheme in either the +// Schemes or SchemesNoAuthority lists. +func Strict() *regexp.Regexp { + re := regexp.MustCompile(strictExp()) + re.Longest() + return re +} + +// Relaxed produces a regexp that matches any URL matched by Strict, plus any +// URL with no scheme. +func Relaxed() *regexp.Regexp { + re := regexp.MustCompile(relaxedExp()) + re.Longest() + return re +} + +// StrictMatchingScheme produces a regexp similar to Strict, but requiring that +// the scheme match the given regular expression. See AnyScheme too. +func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { + strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont + re, err := regexp.Compile(strictMatching) + if err != nil { + return nil, err + } + re.Longest() + return re, nil +}