Replace linkRegex with xurls library (#6261)
* Replace linkRegex with xurls library

  Rather than maintaining a complicated regex to match URLs for autolinking, gitea can use this existing Go library, which takes care of the matching with very little code change to gitea itself. After spending a while trying to find the perfect regex for all cases, this library still works better, as it is more flexible than a single regex ever will be.

  This will also fix the following issues: #5844, #3095, #3381.

  This passes all our current tests, and I've added new ones for the cases mentioned in those issues as well.

* Use xurls.StrictMatchingScheme instead of xurls.Strict

  This is much faster, and we only care about https? links to preserve existing behavior.

tokarchuk/v1.17
parent
01bd1fcd33
commit
f2de5dc8c8
@ -0,0 +1,27 @@ |
||||
Copyright (c) 2015, Daniel Martí. All rights reserved. |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions are |
||||
met: |
||||
|
||||
* Redistributions of source code must retain the above copyright |
||||
notice, this list of conditions and the following disclaimer. |
||||
* Redistributions in binary form must reproduce the above |
||||
copyright notice, this list of conditions and the following disclaimer |
||||
in the documentation and/or other materials provided with the |
||||
distribution. |
||||
* Neither the name of the copyright holder nor the names of its |
||||
contributors may be used to endorse or promote products derived from |
||||
this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,299 @@ |
||||
// Generated by schemesgen
|
||||
|
||||
package xurls |
||||
|
||||
// Schemes is a sorted list of all IANA assigned schemes.
//
// The entries are kept in ascending byte order, as the name of the list
// promises. If the list is regenerated, make sure the generated output is
// sorted as well.
//
// Source:
//   https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
	`aaa`,
	`aaas`,
	`about`,
	`acap`,
	`acct`,
	`acr`,
	`adiumxtra`,
	`afp`,
	`afs`,
	`aim`,
	`appdata`,
	`apt`,
	`attachment`,
	`aw`,
	`barion`,
	`beshare`,
	`bitcoin`,
	`bitcoincash`,
	`blob`,
	`bolo`,
	`browserext`,
	`callto`,
	`cap`,
	`chrome`,
	`chrome-extension`,
	`cid`,
	`coap`,
	`coap+tcp`,
	`coap+ws`,
	`coaps`,
	`coaps+tcp`,
	`coaps+ws`,
	`com-eventbrite-attendee`,
	`content`,
	`conti`,
	`crid`,
	`cvs`,
	`data`,
	`dav`,
	`diaspora`,
	`dict`,
	`did`,
	`dis`,
	`dlna-playcontainer`,
	`dlna-playsingle`,
	`dns`,
	`dntp`,
	`dtn`,
	`dvb`,
	`ed2k`,
	`elsi`,
	`example`,
	`facetime`,
	`fax`,
	`feed`,
	`feedready`,
	`file`,
	`filesystem`,
	`finger`,
	`fish`,
	`ftp`,
	`geo`,
	`gg`,
	`git`,
	`gizmoproject`,
	`go`,
	`gopher`,
	`graph`,
	`gtalk`,
	`h323`,
	`ham`,
	`hcap`,
	`hcp`,
	`http`,
	`https`,
	`hxxp`,
	`hxxps`,
	`hydrazone`,
	`iax`,
	`icap`,
	`icon`,
	`im`,
	`imap`,
	`info`,
	`iotdisco`,
	`ipn`,
	`ipp`,
	`ipps`,
	`irc`,
	`irc6`,
	`ircs`,
	`iris`,
	`iris.beep`,
	`iris.lwz`,
	`iris.xpc`,
	`iris.xpcs`,
	`isostore`,
	`itms`,
	`jabber`,
	`jar`,
	`jms`,
	`keyparc`,
	`lastfm`,
	`ldap`,
	`ldaps`,
	`lvlt`,
	`magnet`,
	`mailserver`,
	`mailto`,
	`maps`,
	`market`,
	`message`,
	`microsoft.windows.camera`,
	`microsoft.windows.camera.multipicker`,
	`microsoft.windows.camera.picker`,
	`mid`,
	`mms`,
	`modem`,
	`mongodb`,
	`moz`,
	`ms-access`,
	`ms-browser-extension`,
	`ms-drive-to`,
	`ms-enrollment`,
	`ms-excel`,
	`ms-gamebarservices`,
	`ms-gamingoverlay`,
	`ms-getoffice`,
	`ms-help`,
	`ms-infopath`,
	`ms-inputapp`,
	`ms-lockscreencomponent-config`,
	`ms-media-stream-id`,
	`ms-mixedrealitycapture`,
	`ms-officeapp`,
	`ms-people`,
	`ms-powerpoint`,
	`ms-project`,
	`ms-publisher`,
	`ms-restoretabcompanion`,
	`ms-screenclip`,
	`ms-screensketch`,
	`ms-search`,
	`ms-search-repair`,
	`ms-secondary-screen-controller`,
	`ms-secondary-screen-setup`,
	`ms-settings`,
	`ms-settings-airplanemode`,
	`ms-settings-bluetooth`,
	`ms-settings-camera`,
	`ms-settings-cellular`,
	`ms-settings-cloudstorage`,
	`ms-settings-connectabledevices`,
	`ms-settings-displays-topology`,
	`ms-settings-emailandaccounts`,
	`ms-settings-language`,
	`ms-settings-location`,
	`ms-settings-lock`,
	`ms-settings-nfctransactions`,
	`ms-settings-notifications`,
	`ms-settings-power`,
	`ms-settings-privacy`,
	`ms-settings-proximity`,
	`ms-settings-screenrotation`,
	`ms-settings-wifi`,
	`ms-settings-workplace`,
	`ms-spd`,
	`ms-sttoverlay`,
	`ms-transit-to`,
	`ms-useractivityset`,
	`ms-virtualtouchpad`,
	`ms-visio`,
	`ms-walk-to`,
	`ms-whiteboard`,
	`ms-whiteboard-cmd`,
	`ms-word`,
	`msnim`,
	`msrp`,
	`msrps`,
	`mtqp`,
	`mumble`,
	`mupdate`,
	`mvn`,
	`news`,
	`nfs`,
	`ni`,
	`nih`,
	`nntp`,
	`notes`,
	`ocf`,
	`oid`,
	`onenote`,
	`onenote-cmd`,
	`opaquelocktoken`,
	`openpgp4fpr`,
	`pack`,
	`palm`,
	`paparazzi`,
	`pkcs11`,
	`platform`,
	`pop`,
	`pres`,
	`prospero`,
	`proxy`,
	`psyc`,
	`pwid`,
	`qb`,
	`query`,
	`redis`,
	`rediss`,
	`reload`,
	`res`,
	`resource`,
	`rmi`,
	`rsync`,
	`rtmfp`,
	`rtmp`,
	`rtsp`,
	`rtsps`,
	`rtspu`,
	`secondlife`,
	`service`,
	`session`,
	`sftp`,
	`sgn`,
	`shttp`,
	`sieve`,
	`simpleledger`,
	`sip`,
	`sips`,
	`skype`,
	`smb`,
	`sms`,
	`smtp`,
	`snews`,
	`snmp`,
	`soap.beep`,
	`soap.beeps`,
	`soldat`,
	`spiffe`,
	`spotify`,
	`ssh`,
	`steam`,
	`stun`,
	`stuns`,
	`submit`,
	`svn`,
	`tag`,
	`teamspeak`,
	`tel`,
	`teliaeid`,
	`telnet`,
	`tftp`,
	`things`,
	`thismessage`,
	`tip`,
	`tn3270`,
	`tool`,
	`turn`,
	`turns`,
	`tv`,
	`udp`,
	`unreal`,
	`urn`,
	`ut2004`,
	`v-event`,
	`vemmi`,
	`ventrilo`,
	`videotex`,
	`view-source`,
	`vnc`,
	`wais`,
	`webcal`,
	`wpid`,
	`ws`,
	`wss`,
	`wtai`,
	`wyciwyg`,
	`xcon`,
	`xcon-userid`,
	`xfire`,
	`xmlrpc.beep`,
	`xmlrpc.beeps`,
	`xmpp`,
	`xri`,
	`ymsgr`,
	`z39.50`,
	`z39.50r`,
	`z39.50s`,
}
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@ |
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package xurls |
||||
|
||||
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
//
// Sources:
//  * https://en.wikipedia.org/wiki/Pseudo-top-level_domain
//  * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
//  * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
//  * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
var PseudoTLDs = []string{
	`bit`,       // Namecoin
	`example`,   // Example domain
	`exit`,      // Tor exit node
	`gnu`,       // GNS by public key
	`i2p`,       // I2P network
	`invalid`,   // Invalid domain
	`local`,     // Local network
	`localhost`, // Local network
	`test`,      // Test domain
	`zkey`,      // GNS domain name
}
@ -0,0 +1,107 @@ |
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
// Package xurls extracts urls from plain text using regular expressions.
|
||||
package xurls |
||||
|
||||
import ( |
||||
"bytes" |
||||
"regexp" |
||||
) |
||||
|
||||
//go:generate go run generate/tldsgen/main.go
|
||||
//go:generate go run generate/schemesgen/main.go
|
||||
|
||||
// Regular expression fragments from which the URL patterns are assembled.
// Fragments without surrounding brackets (letter, iriChar, endChar, midChar,
// ...) are meant to be placed inside a character class by the fragment that
// uses them.
const (
	// Unicode general categories, in Go regexp syntax.
	letter    = `\p{L}`
	mark      = `\p{M}`
	number    = `\p{N}`
	iriChar   = letter + mark + number
	currency  = `\p{Sc}`
	otherSymb = `\p{So}`
	// endChar lists the characters a path is allowed to end with.
	endChar   = iriChar + `/\-+_&~*%=#` + currency + otherSymb
	otherPunc = `\p{Po}`
	// midChar additionally allows "|" and other punctuation, which may appear
	// inside a path but not as its final character.
	midChar = endChar + `|` + otherPunc
	// wellParen, wellBrack and wellBrace match a balanced (...), [...] or
	// {...} group that may itself contain groups of the same kind, one level
	// deep.
	wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)`
	wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]`
	wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}`
	wellAll   = wellParen + `|` + wellBrack + `|` + wellBrace
	// pathCont matches the part of a URL that follows the scheme: runs of
	// midChar, each of which must be followed by a balanced group or an
	// endChar.
	pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+`

	// iri is a single host label: it starts and ends with an iriChar and may
	// contain "-" in between.
	iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?`
	// domain is one or more labels, each followed by a dot; the TLD is
	// appended by the caller.
	domain = `(` + iri + `\.)+`
	// octet matches a decimal number in the range 0-255 without leading zeros.
	octet    = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
	ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b`
	ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:`
	ipAddr   = `(` + ipv4Addr + `|` + ipv6Addr + `)`
	// port is optional; note that a bare ":" without digits is accepted too.
	port = `(:[0-9]*)?`
)
||||
|
||||
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
|
||||
// scheme, and not just the known ones.
|
||||
var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` |
||||
|
||||
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://".
var SchemesNoAuthority = []string{
	`bitcoin`, // Bitcoin
	`file`,    // Files
	`magnet`,  // Torrent magnets
	`mailto`,  // Mail
	`sms`,     // SMS
	`tel`,     // Telephone
	`xmpp`,    // XMPP
}
||||
|
||||
// anyOf returns a regular expression group that matches any one of strs
// literally, e.g. anyOf("a.b", "c") yields `(a\.b|c)`. Every alternative is
// passed through regexp.QuoteMeta so that metacharacters in it lose their
// special meaning. With no arguments the result is the empty group `()`.
func anyOf(strs ...string) string {
	var b bytes.Buffer
	b.WriteByte('(')
	for i, s := range strs {
		if i > 0 {
			// Separator between alternatives; none before the first one.
			b.WriteByte('|')
		}
		b.WriteString(regexp.QuoteMeta(s))
	}
	b.WriteByte(')')
	return b.String()
}
||||
|
||||
func strictExp() string { |
||||
schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)` |
||||
return `(?i)` + schemes + `(?-i)` + pathCont |
||||
} |
||||
|
||||
func relaxedExp() string { |
||||
site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)` |
||||
hostName := `(` + site + `|` + ipAddr + `)` |
||||
webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)` |
||||
return strictExp() + `|` + webURL |
||||
} |
||||
|
||||
// Strict produces a regexp that matches any URL with a scheme in either the
|
||||
// Schemes or SchemesNoAuthority lists.
|
||||
func Strict() *regexp.Regexp { |
||||
re := regexp.MustCompile(strictExp()) |
||||
re.Longest() |
||||
return re |
||||
} |
||||
|
||||
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
|
||||
// URL with no scheme.
|
||||
func Relaxed() *regexp.Regexp { |
||||
re := regexp.MustCompile(relaxedExp()) |
||||
re.Longest() |
||||
return re |
||||
} |
||||
|
||||
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
|
||||
// the scheme match the given regular expression. See AnyScheme too.
|
||||
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { |
||||
strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont |
||||
re, err := regexp.Compile(strictMatching) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
re.Longest() |
||||
return re, nil |
||||
} |
Loading…
Reference in new issue