You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
180 lines
3.5 KiB
180 lines
3.5 KiB
10 years ago
|
package mahonia
|
||
|
|
||
|
// decoding HTML entities
|
||
|
|
||
|
import (
|
||
|
"sort"
|
||
|
)
|
||
|
|
||
|
// EntityDecoder returns a Decoder that decodes HTML character entities.
|
||
|
// If there is no valid character entity at the current position, it returns INVALID_CHAR.
|
||
|
// So it needs to be combined with another Decoder via FallbackDecoder.
|
||
|
func EntityDecoder() Decoder {
|
||
|
var leftover rune // leftover rune from two-rune entity
|
||
|
return func(p []byte) (r rune, size int, status Status) {
|
||
|
if leftover != 0 {
|
||
|
r = leftover
|
||
|
leftover = 0
|
||
|
return r, 0, SUCCESS
|
||
|
}
|
||
|
|
||
|
if len(p) == 0 {
|
||
|
return 0, 0, NO_ROOM
|
||
|
}
|
||
|
|
||
|
if p[0] != '&' {
|
||
|
return 0xfffd, 1, INVALID_CHAR
|
||
|
}
|
||
|
|
||
|
if len(p) < 3 {
|
||
|
return 0, 1, NO_ROOM
|
||
|
}
|
||
|
|
||
|
r, size, status = 0xfffd, 1, INVALID_CHAR
|
||
|
n := 1 // number of bytes read so far
|
||
|
|
||
|
if p[n] == '#' {
|
||
|
n++
|
||
|
c := p[n]
|
||
|
hex := false
|
||
|
if c == 'x' || c == 'X' {
|
||
|
hex = true
|
||
|
n++
|
||
|
}
|
||
|
|
||
|
var x rune
|
||
|
for n < len(p) {
|
||
|
c = p[n]
|
||
|
n++
|
||
|
if hex {
|
||
|
if '0' <= c && c <= '9' {
|
||
|
x = 16*x + rune(c) - '0'
|
||
|
continue
|
||
|
} else if 'a' <= c && c <= 'f' {
|
||
|
x = 16*x + rune(c) - 'a' + 10
|
||
|
continue
|
||
|
} else if 'A' <= c && c <= 'F' {
|
||
|
x = 16*x + rune(c) - 'A' + 10
|
||
|
continue
|
||
|
}
|
||
|
} else if '0' <= c && c <= '9' {
|
||
|
x = 10*x + rune(c) - '0'
|
||
|
continue
|
||
|
}
|
||
|
if c != ';' {
|
||
|
n--
|
||
|
}
|
||
|
break
|
||
|
}
|
||
|
|
||
|
if n == len(p) && p[n-1] != ';' {
|
||
|
return 0, 0, NO_ROOM
|
||
|
}
|
||
|
|
||
|
size = n
|
||
|
if p[n-1] == ';' {
|
||
|
n--
|
||
|
}
|
||
|
if hex {
|
||
|
n--
|
||
|
}
|
||
|
n--
|
||
|
// Now n is the number of actual digits read.
|
||
|
if n == 0 {
|
||
|
return 0xfffd, 1, INVALID_CHAR
|
||
|
}
|
||
|
|
||
|
if 0x80 <= x && x <= 0x9F {
|
||
|
// Replace characters from Windows-1252 with UTF-8 equivalents.
|
||
|
x = replacementTable[x-0x80]
|
||
|
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
||
|
// Replace invalid characters with the replacement character.
|
||
|
return 0xfffd, size, INVALID_CHAR
|
||
|
}
|
||
|
|
||
|
r = x
|
||
|
status = SUCCESS
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// Look for a named entity in EntityList.
|
||
|
|
||
|
possible := entityList
|
||
|
for len(possible) > 0 {
|
||
|
if len(p) <= n {
|
||
|
leftover = 0
|
||
|
return 0, 0, NO_ROOM
|
||
|
}
|
||
|
|
||
|
c := p[n]
|
||
|
|
||
|
// Narrow down the selection in possible to those items that have c in the
|
||
|
// appropriate byte.
|
||
|
first := sort.Search(len(possible), func(i int) bool {
|
||
|
e := possible[i].name
|
||
|
if len(e) < n {
|
||
|
return false
|
||
|
}
|
||
|
return e[n-1] >= c
|
||
|
})
|
||
|
possible = possible[first:]
|
||
|
last := sort.Search(len(possible), func(i int) bool {
|
||
|
return possible[i].name[n-1] > c
|
||
|
})
|
||
|
possible = possible[:last]
|
||
|
|
||
|
n++
|
||
|
if len(possible) > 0 && len(possible[0].name) == n-1 {
|
||
|
r, leftover = possible[0].r1, possible[0].r2
|
||
|
size = n
|
||
|
status = SUCCESS
|
||
|
// but don't return yet, since we need the longest match
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// replacementTable maps numeric character references in the range 0x80-0x9F
// to the rune that should be produced instead; index i holds the replacement
// for 0x80+i.
//
// This table is copied from /src/pkg/html/escape.go in the Go source.
//
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	0x20AC, 0x0081, 0x201A, 0x0192, // 0x80-0x83
	0x201E, 0x2026, 0x2020, 0x2021, // 0x84-0x87
	0x02C6, 0x2030, 0x0160, 0x2039, // 0x88-0x8B
	0x0152, 0x008D, 0x017D, 0x008F, // 0x8C-0x8F
	0x0090, 0x2018, 0x2019, 0x201C, // 0x90-0x93
	0x201D, 0x2022, 0x2013, 0x2014, // 0x94-0x97
	0x02DC, 0x2122, 0x0161, 0x203A, // 0x98-0x9B
	0x0153, 0x009D, 0x017E, 0x0178, // 0x9C-0x9F
}
|