Replace linkRegex with xurls library (#6261)
* Replace linkRegex with xurls library. Rather than maintaining a complicated regex to match URLs for autolinking, Gitea can use this existing Go library, which takes care of the matching with very little code change to Gitea itself. After spending a while trying to find the perfect regex for all cases, this library still works better, as it is more flexible than a single regex ever will be. This will also fix the following issues: #5844, #3095, #3381. This passes all our current tests, and I've added new ones mentioned in those issues as well. * Use xurls.StrictMatchingScheme instead of xurls.Strict. This is much faster, and we only care about https? links to preserve existing behavior. tokarchuk/v1.17
parent
01bd1fcd33
commit
f2de5dc8c8
@ -0,0 +1,27 @@ |
|||||||
|
Copyright (c) 2015, Daniel Martí. All rights reserved. |
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without |
||||||
|
modification, are permitted provided that the following conditions are |
||||||
|
met: |
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright |
||||||
|
notice, this list of conditions and the following disclaimer. |
||||||
|
* Redistributions in binary form must reproduce the above |
||||||
|
copyright notice, this list of conditions and the following disclaimer |
||||||
|
in the documentation and/or other materials provided with the |
||||||
|
distribution. |
||||||
|
* Neither the name of the copyright holder nor the names of its |
||||||
|
contributors may be used to endorse or promote products derived from |
||||||
|
this software without specific prior written permission. |
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||||
|
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,299 @@ |
|||||||
|
// Generated by schemesgen
|
||||||
|
|
||||||
|
package xurls |
||||||
|
|
||||||
|
// Schemes is a sorted list of all IANA assigned schemes.
//
// The entries are kept in strict byte-wise ascending order, so the slice can
// be searched with sort.SearchStrings. strictExp joins them into one
// alternation that must be followed by "://".
//
// NOTE(review): this file is marked as generated by schemesgen; the ordering
// of `ms-powerpoint`/`ms-project`, `psyc`/`pwid` and `view-source`/`vnc` was
// corrected here by hand, so the generator should sort its output as well or
// the next regeneration will undo it.
//
// Source:
// https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
	`aaa`,
	`aaas`,
	`about`,
	`acap`,
	`acct`,
	`acr`,
	`adiumxtra`,
	`afp`,
	`afs`,
	`aim`,
	`appdata`,
	`apt`,
	`attachment`,
	`aw`,
	`barion`,
	`beshare`,
	`bitcoin`,
	`bitcoincash`,
	`blob`,
	`bolo`,
	`browserext`,
	`callto`,
	`cap`,
	`chrome`,
	`chrome-extension`,
	`cid`,
	`coap`,
	`coap+tcp`,
	`coap+ws`,
	`coaps`,
	`coaps+tcp`,
	`coaps+ws`,
	`com-eventbrite-attendee`,
	`content`,
	`conti`,
	`crid`,
	`cvs`,
	`data`,
	`dav`,
	`diaspora`,
	`dict`,
	`did`,
	`dis`,
	`dlna-playcontainer`,
	`dlna-playsingle`,
	`dns`,
	`dntp`,
	`dtn`,
	`dvb`,
	`ed2k`,
	`elsi`,
	`example`,
	`facetime`,
	`fax`,
	`feed`,
	`feedready`,
	`file`,
	`filesystem`,
	`finger`,
	`fish`,
	`ftp`,
	`geo`,
	`gg`,
	`git`,
	`gizmoproject`,
	`go`,
	`gopher`,
	`graph`,
	`gtalk`,
	`h323`,
	`ham`,
	`hcap`,
	`hcp`,
	`http`,
	`https`,
	`hxxp`,
	`hxxps`,
	`hydrazone`,
	`iax`,
	`icap`,
	`icon`,
	`im`,
	`imap`,
	`info`,
	`iotdisco`,
	`ipn`,
	`ipp`,
	`ipps`,
	`irc`,
	`irc6`,
	`ircs`,
	`iris`,
	`iris.beep`,
	`iris.lwz`,
	`iris.xpc`,
	`iris.xpcs`,
	`isostore`,
	`itms`,
	`jabber`,
	`jar`,
	`jms`,
	`keyparc`,
	`lastfm`,
	`ldap`,
	`ldaps`,
	`lvlt`,
	`magnet`,
	`mailserver`,
	`mailto`,
	`maps`,
	`market`,
	`message`,
	`microsoft.windows.camera`,
	`microsoft.windows.camera.multipicker`,
	`microsoft.windows.camera.picker`,
	`mid`,
	`mms`,
	`modem`,
	`mongodb`,
	`moz`,
	`ms-access`,
	`ms-browser-extension`,
	`ms-drive-to`,
	`ms-enrollment`,
	`ms-excel`,
	`ms-gamebarservices`,
	`ms-gamingoverlay`,
	`ms-getoffice`,
	`ms-help`,
	`ms-infopath`,
	`ms-inputapp`,
	`ms-lockscreencomponent-config`,
	`ms-media-stream-id`,
	`ms-mixedrealitycapture`,
	`ms-officeapp`,
	`ms-people`,
	`ms-powerpoint`,
	`ms-project`,
	`ms-publisher`,
	`ms-restoretabcompanion`,
	`ms-screenclip`,
	`ms-screensketch`,
	`ms-search`,
	`ms-search-repair`,
	`ms-secondary-screen-controller`,
	`ms-secondary-screen-setup`,
	`ms-settings`,
	`ms-settings-airplanemode`,
	`ms-settings-bluetooth`,
	`ms-settings-camera`,
	`ms-settings-cellular`,
	`ms-settings-cloudstorage`,
	`ms-settings-connectabledevices`,
	`ms-settings-displays-topology`,
	`ms-settings-emailandaccounts`,
	`ms-settings-language`,
	`ms-settings-location`,
	`ms-settings-lock`,
	`ms-settings-nfctransactions`,
	`ms-settings-notifications`,
	`ms-settings-power`,
	`ms-settings-privacy`,
	`ms-settings-proximity`,
	`ms-settings-screenrotation`,
	`ms-settings-wifi`,
	`ms-settings-workplace`,
	`ms-spd`,
	`ms-sttoverlay`,
	`ms-transit-to`,
	`ms-useractivityset`,
	`ms-virtualtouchpad`,
	`ms-visio`,
	`ms-walk-to`,
	`ms-whiteboard`,
	`ms-whiteboard-cmd`,
	`ms-word`,
	`msnim`,
	`msrp`,
	`msrps`,
	`mtqp`,
	`mumble`,
	`mupdate`,
	`mvn`,
	`news`,
	`nfs`,
	`ni`,
	`nih`,
	`nntp`,
	`notes`,
	`ocf`,
	`oid`,
	`onenote`,
	`onenote-cmd`,
	`opaquelocktoken`,
	`openpgp4fpr`,
	`pack`,
	`palm`,
	`paparazzi`,
	`pkcs11`,
	`platform`,
	`pop`,
	`pres`,
	`prospero`,
	`proxy`,
	`psyc`,
	`pwid`,
	`qb`,
	`query`,
	`redis`,
	`rediss`,
	`reload`,
	`res`,
	`resource`,
	`rmi`,
	`rsync`,
	`rtmfp`,
	`rtmp`,
	`rtsp`,
	`rtsps`,
	`rtspu`,
	`secondlife`,
	`service`,
	`session`,
	`sftp`,
	`sgn`,
	`shttp`,
	`sieve`,
	`simpleledger`,
	`sip`,
	`sips`,
	`skype`,
	`smb`,
	`sms`,
	`smtp`,
	`snews`,
	`snmp`,
	`soap.beep`,
	`soap.beeps`,
	`soldat`,
	`spiffe`,
	`spotify`,
	`ssh`,
	`steam`,
	`stun`,
	`stuns`,
	`submit`,
	`svn`,
	`tag`,
	`teamspeak`,
	`tel`,
	`teliaeid`,
	`telnet`,
	`tftp`,
	`things`,
	`thismessage`,
	`tip`,
	`tn3270`,
	`tool`,
	`turn`,
	`turns`,
	`tv`,
	`udp`,
	`unreal`,
	`urn`,
	`ut2004`,
	`v-event`,
	`vemmi`,
	`ventrilo`,
	`videotex`,
	`view-source`,
	`vnc`,
	`wais`,
	`webcal`,
	`wpid`,
	`ws`,
	`wss`,
	`wtai`,
	`wyciwyg`,
	`xcon`,
	`xcon-userid`,
	`xfire`,
	`xmlrpc.beep`,
	`xmlrpc.beeps`,
	`xmpp`,
	`xri`,
	`ymsgr`,
	`z39.50`,
	`z39.50r`,
	`z39.50s`,
}
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,24 @@ |
|||||||
|
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||||
|
// See LICENSE for licensing information
|
||||||
|
|
||||||
|
package xurls |
||||||
|
|
||||||
|
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
// relaxedExp appends these entries to TLDs when it builds the host pattern
// for scheme-less URLs, and that part of the pattern is case-insensitive.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// * https://en.wikipedia.org/wiki/Pseudo-top-level_domain
|
||||||
|
// * https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
|
||||||
|
// * https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
|
||||||
|
// * https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
|
||||||
|
var PseudoTLDs = []string{ |
||||||
|
`bit`, // Namecoin
|
||||||
|
`example`, // Example domain
|
||||||
|
`exit`, // Tor exit node
|
||||||
|
`gnu`, // GNS by public key
|
||||||
|
`i2p`, // I2P network
|
||||||
|
`invalid`, // Invalid domain
|
||||||
|
`local`, // Local network
|
||||||
|
`localhost`, // Local network
|
||||||
|
`test`, // Test domain
|
||||||
|
`zkey`, // GNS domain name
|
||||||
|
} |
@ -0,0 +1,107 @@ |
|||||||
|
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||||
|
// See LICENSE for licensing information
|
||||||
|
|
||||||
|
// Package xurls extracts URLs from plain text using regular expressions.
|
||||||
|
package xurls |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"regexp" |
||||||
|
) |
||||||
|
|
||||||
|
//go:generate go run generate/tldsgen/main.go
|
||||||
|
//go:generate go run generate/schemesgen/main.go
|
||||||
|
|
||||||
|
// Regular-expression fragments that the exported patterns are assembled from.
// The first group (letter .. midChar) are bare character-class members: they
// are only ever spliced between "[" and "]" by the fragments further down.
const ( |
||||||
|
// Unicode letters.
letter = `\p{L}` |
||||||
|
// Unicode combining marks.
mark = `\p{M}` |
||||||
|
// Unicode numbers.
number = `\p{N}` |
||||||
|
// Characters allowed in an internationalized host label.
iriChar = letter + mark + number |
||||||
|
// Unicode currency symbols.
currency = `\p{Sc}` |
||||||
|
// Unicode "other" symbols.
otherSymb = `\p{So}` |
||||||
|
// Characters a path may end with (see pathCont, whose every repetition must
// finish on one of these or on a balanced bracket group).
endChar = iriChar + `/\-+_&~*%=#` + currency + otherSymb |
||||||
|
// Unicode "other" punctuation.
otherPunc = `\p{Po}` |
||||||
|
// Characters allowed in the middle of a path. Because this is used inside a
// character class, the `|` here is a literal pipe character, not alternation.
midChar = endChar + `|` + otherPunc |
||||||
|
// Balanced (...), [...] and {...} groups of midChar, each allowing nested
// groups of the same kind one level deep.
wellParen = `\([` + midChar + `]*(\([` + midChar + `]*\)[` + midChar + `]*)*\)` |
||||||
|
wellBrack = `\[[` + midChar + `]*(\[[` + midChar + `]*\][` + midChar + `]*)*\]` |
||||||
|
wellBrace = `\{[` + midChar + `]*(\{[` + midChar + `]*\}[` + midChar + `]*)*\}` |
||||||
|
// Any one of the three balanced groups.
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace |
||||||
|
// Everything after the scheme (or after the host's "/"): runs of midChar, each
// run closed by a balanced group or an endChar, so a match never stops on
// trailing punctuation such as "." or ",".
pathCont = `([` + midChar + `]*(` + wellAll + `|[` + endChar + `])+)+` |
||||||
|
|
||||||
|
// A single host label: starts and ends with an iriChar, hyphens allowed only
// in between.
iri = `[` + iriChar + `]([` + iriChar + `\-]*[` + iriChar + `])?` |
||||||
|
// One or more labels, each followed by a dot; the TLD is appended by
// relaxedExp.
domain = `(` + iri + `\.)+` |
||||||
|
// A decimal number from 0 to 255.
octet = `(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])` |
||||||
|
// Four dot-separated octets delimited by word boundaries.
ipv4Addr = `\b` + octet + `\.` + octet + `\.` + octet + `\.` + octet + `\b` |
||||||
|
// NOTE(review): appears to cover full, "::"-compressed and IPv4-suffixed IPv6
// forms; the pattern is opaque and has not been checked exhaustively.
ipv6Addr = `([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:([0-9a-fA-F]{1,4}:[0-9a-fA-F]{0,4}|:[0-9a-fA-F]{1,4})?|(:[0-9a-fA-F]{1,4}){0,2})|(:[0-9a-fA-F]{1,4}){0,3})|(:[0-9a-fA-F]{1,4}){0,4})|:(:[0-9a-fA-F]{1,4}){0,5})((:[0-9a-fA-F]{1,4}){2}|:(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})|(([0-9a-fA-F]{1,4}:){1,6}|:):[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){7}:` |
||||||
|
// Either kind of IP address.
ipAddr = `(` + ipv4Addr + `|` + ipv6Addr + `)` |
||||||
|
// Optional port: a colon followed by zero or more digits, so a bare ":" also
// satisfies it.
port = `(:[0-9]*)?` |
||||||
|
) |
||||||
|
|
||||||
|
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
|
||||||
|
// scheme, and not just the known ones.
|
||||||
|
var AnyScheme = `([a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)` |
||||||
|
|
||||||
|
// SchemesNoAuthority is a sorted list of some well-known URL schemes that are
|
||||||
|
// followed by ":" instead of "://".
// strictExp and AnyScheme both join these entries with anyOf and append the
// ":" separator themselves, so the entries carry no punctuation.
|
||||||
|
var SchemesNoAuthority = []string{ |
||||||
|
`bitcoin`, // Bitcoin
|
||||||
|
`file`, // Files
|
||||||
|
`magnet`, // Torrent magnets
|
||||||
|
`mailto`, // Mail
|
||||||
|
`sms`, // SMS
|
||||||
|
`tel`, // Telephone
|
||||||
|
`xmpp`, // XMPP
|
||||||
|
} |
||||||
|
|
||||||
|
// anyOf builds a parenthesised regexp alternation that matches any one of
// strs literally: every element is escaped with regexp.QuoteMeta and the
// results are joined with "|". With no arguments it returns "()".
func anyOf(strs ...string) string {
	escaped := make([][]byte, 0, len(strs))
	for _, literal := range strs {
		escaped = append(escaped, []byte(regexp.QuoteMeta(literal)))
	}
	return "(" + string(bytes.Join(escaped, []byte("|"))) + ")"
}
||||||
|
|
||||||
|
func strictExp() string { |
||||||
|
schemes := `(` + anyOf(Schemes...) + `://|` + anyOf(SchemesNoAuthority...) + `:)` |
||||||
|
return `(?i)` + schemes + `(?-i)` + pathCont |
||||||
|
} |
||||||
|
|
||||||
|
func relaxedExp() string { |
||||||
|
site := domain + `(?i)` + anyOf(append(TLDs, PseudoTLDs...)...) + `(?-i)` |
||||||
|
hostName := `(` + site + `|` + ipAddr + `)` |
||||||
|
webURL := hostName + port + `(/|/` + pathCont + `?|\b|$)` |
||||||
|
return strictExp() + `|` + webURL |
||||||
|
} |
||||||
|
|
||||||
|
// Strict produces a regexp that matches any URL with a scheme in either the
|
||||||
|
// Schemes or SchemesNoAuthority lists.
|
||||||
|
func Strict() *regexp.Regexp { |
||||||
|
re := regexp.MustCompile(strictExp()) |
||||||
|
re.Longest() |
||||||
|
return re |
||||||
|
} |
||||||
|
|
||||||
|
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
|
||||||
|
// URL with no scheme.
|
||||||
|
func Relaxed() *regexp.Regexp { |
||||||
|
re := regexp.MustCompile(relaxedExp()) |
||||||
|
re.Longest() |
||||||
|
return re |
||||||
|
} |
||||||
|
|
||||||
|
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
|
||||||
|
// the scheme match the given regular expression. See AnyScheme too.
|
||||||
|
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) { |
||||||
|
strictMatching := `(?i)(` + exp + `)(?-i)` + pathCont |
||||||
|
re, err := regexp.Compile(strictMatching) |
||||||
|
if err != nil { |
||||||
|
return nil, err |
||||||
|
} |
||||||
|
re.Longest() |
||||||
|
return re, nil |
||||||
|
} |
Loading…
Reference in new issue