You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
268 lines
8.5 KiB
268 lines
8.5 KiB
package uniseg
|
|
|
|
import "unicode/utf8"
|
|
|
|
// The states of the grapheme cluster parser.
|
|
const (
|
|
grAny = iota
|
|
grCR
|
|
grControlLF
|
|
grL
|
|
grLVV
|
|
grLVTT
|
|
grPrepend
|
|
grExtendedPictographic
|
|
grExtendedPictographicZWJ
|
|
grRIOdd
|
|
grRIEven
|
|
)
|
|
|
|
// The grapheme cluster parser's breaking instructions.
|
|
const (
|
|
grNoBoundary = iota
|
|
grBoundary
|
|
)
|
|
|
|
// The grapheme cluster parser's state transitions. Maps (state, property) to
|
|
// (new state, breaking instruction, rule number). The breaking instruction
|
|
// always refers to the boundary between the last and next code point.
|
|
//
|
|
// This map is queried as follows:
|
|
//
|
|
// 1. Find specific state + specific property. Stop if found.
|
|
// 2. Find specific state + any property.
|
|
// 3. Find any state + specific property.
|
|
// 4. If only (2) or (3) (but not both) was found, stop.
|
|
// 5. If both (2) and (3) were found, use state and breaking instruction from
|
|
// the transition with the lower rule number, prefer (3) if rule numbers
|
|
// are equal. Stop.
|
|
// 6. Assume grAny and grBoundary.
|
|
var grTransitions = map[[2]int][3]int{
|
|
// GB5
|
|
{grAny, prCR}: {grCR, grBoundary, 50},
|
|
{grAny, prLF}: {grControlLF, grBoundary, 50},
|
|
{grAny, prControl}: {grControlLF, grBoundary, 50},
|
|
|
|
// GB4
|
|
{grCR, prAny}: {grAny, grBoundary, 40},
|
|
{grControlLF, prAny}: {grAny, grBoundary, 40},
|
|
|
|
// GB3.
|
|
{grCR, prLF}: {grAny, grNoBoundary, 30},
|
|
|
|
// GB6.
|
|
{grAny, prL}: {grL, grBoundary, 9990},
|
|
{grL, prL}: {grL, grNoBoundary, 60},
|
|
{grL, prV}: {grLVV, grNoBoundary, 60},
|
|
{grL, prLV}: {grLVV, grNoBoundary, 60},
|
|
{grL, prLVT}: {grLVTT, grNoBoundary, 60},
|
|
|
|
// GB7.
|
|
{grAny, prLV}: {grLVV, grBoundary, 9990},
|
|
{grAny, prV}: {grLVV, grBoundary, 9990},
|
|
{grLVV, prV}: {grLVV, grNoBoundary, 70},
|
|
{grLVV, prT}: {grLVTT, grNoBoundary, 70},
|
|
|
|
// GB8.
|
|
{grAny, prLVT}: {grLVTT, grBoundary, 9990},
|
|
{grAny, prT}: {grLVTT, grBoundary, 9990},
|
|
{grLVTT, prT}: {grLVTT, grNoBoundary, 80},
|
|
|
|
// GB9.
|
|
{grAny, prExtend}: {grAny, grNoBoundary, 90},
|
|
{grAny, prZWJ}: {grAny, grNoBoundary, 90},
|
|
|
|
// GB9a.
|
|
{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
|
|
|
|
// GB9b.
|
|
{grAny, prPreprend}: {grPrepend, grBoundary, 9990},
|
|
{grPrepend, prAny}: {grAny, grNoBoundary, 92},
|
|
|
|
// GB11.
|
|
{grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
|
|
{grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
|
|
{grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
|
|
{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
|
|
|
|
// GB12 / GB13.
|
|
{grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
|
|
{grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
|
|
{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
|
|
}
|
|
|
|
// Graphemes implements an iterator over Unicode extended grapheme clusters,
|
|
// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
|
|
// "user-perceived characters". These characters often consist of multiple
|
|
// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
|
|
// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
|
|
// woman) and the rules described in Annex #29 must be applied to group those
|
|
// code points into clusters perceived by the user as one character.
|
|
type Graphemes struct {
|
|
// The code points over which this class iterates.
|
|
codePoints []rune
|
|
|
|
// The (byte-based) indices of the code points into the original string plus
|
|
// len(original string). Thus, len(indices) = len(codePoints) + 1.
|
|
indices []int
|
|
|
|
// The current grapheme cluster to be returned. These are indices into
|
|
// codePoints/indices. If start == end, we either haven't started iterating
|
|
// yet (0) or the iteration has already completed (1).
|
|
start, end int
|
|
|
|
// The index of the next code point to be parsed.
|
|
pos int
|
|
|
|
// The current state of the code point parser.
|
|
state int
|
|
}
|
|
|
|
// NewGraphemes returns a new grapheme cluster iterator.
|
|
func NewGraphemes(s string) *Graphemes {
|
|
l := utf8.RuneCountInString(s)
|
|
codePoints := make([]rune, l)
|
|
indices := make([]int, l+1)
|
|
i := 0
|
|
for pos, r := range s {
|
|
codePoints[i] = r
|
|
indices[i] = pos
|
|
i++
|
|
}
|
|
indices[l] = len(s)
|
|
g := &Graphemes{
|
|
codePoints: codePoints,
|
|
indices: indices,
|
|
}
|
|
g.Next() // Parse ahead.
|
|
return g
|
|
}
|
|
|
|
// Next advances the iterator by one grapheme cluster and returns false if no
|
|
// clusters are left. This function must be called before the first cluster is
|
|
// accessed.
|
|
func (g *Graphemes) Next() bool {
|
|
g.start = g.end
|
|
|
|
// The state transition gives us a boundary instruction BEFORE the next code
|
|
// point so we always need to stay ahead by one code point.
|
|
|
|
// Parse the next code point.
|
|
for g.pos <= len(g.codePoints) {
|
|
// GB2.
|
|
if g.pos == len(g.codePoints) {
|
|
g.end = g.pos
|
|
g.pos++
|
|
break
|
|
}
|
|
|
|
// Determine the property of the next character.
|
|
nextProperty := property(g.codePoints[g.pos])
|
|
g.pos++
|
|
|
|
// Find the applicable transition.
|
|
var boundary bool
|
|
transition, ok := grTransitions[[2]int{g.state, nextProperty}]
|
|
if ok {
|
|
// We have a specific transition. We'll use it.
|
|
g.state = transition[0]
|
|
boundary = transition[1] == grBoundary
|
|
} else {
|
|
// No specific transition found. Try the less specific ones.
|
|
transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
|
|
transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
|
|
if okAnyProp && okAnyState {
|
|
// Both apply. We'll use a mix (see comments for grTransitions).
|
|
g.state = transAnyState[0]
|
|
boundary = transAnyState[1] == grBoundary
|
|
if transAnyProp[2] < transAnyState[2] {
|
|
g.state = transAnyProp[0]
|
|
boundary = transAnyProp[1] == grBoundary
|
|
}
|
|
} else if okAnyProp {
|
|
// We only have a specific state.
|
|
g.state = transAnyProp[0]
|
|
boundary = transAnyProp[1] == grBoundary
|
|
// This branch will probably never be reached because okAnyState will
|
|
// always be true given the current transition map. But we keep it here
|
|
// for future modifications to the transition map where this may not be
|
|
// true anymore.
|
|
} else if okAnyState {
|
|
// We only have a specific property.
|
|
g.state = transAnyState[0]
|
|
boundary = transAnyState[1] == grBoundary
|
|
} else {
|
|
// No known transition. GB999: Any x Any.
|
|
g.state = grAny
|
|
boundary = true
|
|
}
|
|
}
|
|
|
|
// If we found a cluster boundary, let's stop here. The current cluster will
|
|
// be the one that just ended.
|
|
if g.pos-1 == 0 /* GB1 */ || boundary {
|
|
g.end = g.pos - 1
|
|
break
|
|
}
|
|
}
|
|
|
|
return g.start != g.end
|
|
}
|
|
|
|
// Runes returns a slice of runes (code points) which corresponds to the current
|
|
// grapheme cluster. If the iterator is already past the end or Next() has not
|
|
// yet been called, nil is returned.
|
|
func (g *Graphemes) Runes() []rune {
|
|
if g.start == g.end {
|
|
return nil
|
|
}
|
|
return g.codePoints[g.start:g.end]
|
|
}
|
|
|
|
// Str returns a substring of the original string which corresponds to the
|
|
// current grapheme cluster. If the iterator is already past the end or Next()
|
|
// has not yet been called, an empty string is returned.
|
|
func (g *Graphemes) Str() string {
|
|
if g.start == g.end {
|
|
return ""
|
|
}
|
|
return string(g.codePoints[g.start:g.end])
|
|
}
|
|
|
|
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
|
|
// If the iterator is already past the end or Next() has not yet been called,
|
|
// nil is returned.
|
|
func (g *Graphemes) Bytes() []byte {
|
|
if g.start == g.end {
|
|
return nil
|
|
}
|
|
return []byte(string(g.codePoints[g.start:g.end]))
|
|
}
|
|
|
|
// Positions returns the interval of the current grapheme cluster as byte
|
|
// positions into the original string. The first returned value "from" indexes
|
|
// the first byte and the second returned value "to" indexes the first byte that
|
|
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
|
|
// the original string "str". If Next() has not yet been called, both values are
|
|
// 0. If the iterator is already past the end, both values are 1.
|
|
func (g *Graphemes) Positions() (int, int) {
|
|
return g.indices[g.start], g.indices[g.end]
|
|
}
|
|
|
|
// Reset puts the iterator into its initial state such that the next call to
|
|
// Next() sets it to the first grapheme cluster again.
|
|
func (g *Graphemes) Reset() {
|
|
g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
|
|
g.Next() // Parse ahead again.
|
|
}
|
|
|
|
// GraphemeClusterCount returns the number of user-perceived characters
|
|
// (grapheme clusters) for the given string. To calculate this number, it
|
|
// iterates through the string using the Graphemes iterator.
|
|
func GraphemeClusterCount(s string) (n int) {
|
|
g := NewGraphemes(s)
|
|
for g.Next() {
|
|
n++
|
|
}
|
|
return
|
|
}
|
|
|