You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
390 lines
6.8 KiB
390 lines
6.8 KiB
5 years ago
|
package snowballstem
|
||
|
|
||
|
import (
|
||
|
"log"
|
||
|
"strings"
|
||
|
"unicode/utf8"
|
||
|
)
|
||
|
|
||
|
// Env is the Snowball execution environment: the string being worked on
// together with the byte offsets the runtime operations read and update.
type Env struct {
	// current is the text the environment operates on.
	current string

	// Cursor is the byte offset of the current position in current.
	// Limit is the forward bound: forward tests fail once Cursor >= Limit.
	// LimitBackward is the backward bound: backward tests fail once
	// Cursor <= LimitBackward.
	Cursor, Limit, LimitBackward int

	// Bra and Ket are the byte offsets delimiting the slice of current that
	// SliceFrom replaces and SliceTo returns.
	Bra, Ket int
}
|
||
|
|
||
|
// NewEnv creates a new Snowball execution environment on the provided string
|
||
|
func NewEnv(val string) *Env {
|
||
|
return &Env{
|
||
|
current: val,
|
||
|
Cursor: 0,
|
||
|
Limit: len(val),
|
||
|
LimitBackward: 0,
|
||
|
Bra: 0,
|
||
|
Ket: len(val),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) Current() string {
|
||
|
return env.current
|
||
|
}
|
||
|
|
||
|
func (env *Env) SetCurrent(s string) {
|
||
|
env.current = s
|
||
|
env.Cursor = 0
|
||
|
env.Limit = len(s)
|
||
|
env.LimitBackward = 0
|
||
|
env.Bra = 0
|
||
|
env.Ket = len(s)
|
||
|
}
|
||
|
|
||
|
func (env *Env) ReplaceS(bra, ket int, s string) int32 {
|
||
|
adjustment := int32(len(s)) - (int32(ket) - int32(bra))
|
||
|
result, _ := splitAt(env.current, bra)
|
||
|
rsplit := ket
|
||
|
if ket < bra {
|
||
|
rsplit = bra
|
||
|
}
|
||
|
_, rhs := splitAt(env.current, rsplit)
|
||
|
result += s
|
||
|
result += rhs
|
||
|
|
||
|
newLim := int32(env.Limit) + adjustment
|
||
|
env.Limit = int(newLim)
|
||
|
|
||
|
if env.Cursor >= ket {
|
||
|
newCur := int32(env.Cursor) + adjustment
|
||
|
env.Cursor = int(newCur)
|
||
|
} else if env.Cursor > bra {
|
||
|
env.Cursor = bra
|
||
|
}
|
||
|
|
||
|
env.current = result
|
||
|
return adjustment
|
||
|
}
|
||
|
|
||
|
func (env *Env) EqS(s string) bool {
|
||
|
if env.Cursor >= env.Limit {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
if strings.HasPrefix(env.current[env.Cursor:], s) {
|
||
|
env.Cursor += len(s)
|
||
|
for !onCharBoundary(env.current, env.Cursor) {
|
||
|
env.Cursor++
|
||
|
}
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (env *Env) EqSB(s string) bool {
|
||
|
if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) {
|
||
|
return false
|
||
|
} else if !onCharBoundary(env.current, env.Cursor-len(s)) ||
|
||
|
!strings.HasPrefix(env.current[env.Cursor-len(s):], s) {
|
||
|
return false
|
||
|
} else {
|
||
|
env.Cursor -= len(s)
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) SliceFrom(s string) bool {
|
||
|
bra, ket := env.Bra, env.Ket
|
||
|
env.ReplaceS(bra, ket, s)
|
||
|
return true
|
||
|
}
|
||
|
|
||
|
func (env *Env) NextChar() {
|
||
|
env.Cursor++
|
||
|
for !onCharBoundary(env.current, env.Cursor) {
|
||
|
env.Cursor++
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) PrevChar() {
|
||
|
env.Cursor--
|
||
|
for !onCharBoundary(env.current, env.Cursor) {
|
||
|
env.Cursor--
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) ByteIndexForHop(delta int32) int32 {
|
||
|
if delta > 0 {
|
||
|
res := env.Cursor
|
||
|
for delta > 0 {
|
||
|
res++
|
||
|
delta--
|
||
|
for res <= len(env.current) && !onCharBoundary(env.current, res) {
|
||
|
res++
|
||
|
}
|
||
|
}
|
||
|
return int32(res)
|
||
|
} else if delta < 0 {
|
||
|
res := env.Cursor
|
||
|
for delta < 0 {
|
||
|
res--
|
||
|
delta++
|
||
|
for res >= 0 && !onCharBoundary(env.current, res) {
|
||
|
res--
|
||
|
}
|
||
|
}
|
||
|
return int32(res)
|
||
|
} else {
|
||
|
return int32(env.Cursor)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) InGrouping(chars []byte, min, max int32) bool {
|
||
|
if env.Cursor >= env.Limit {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
||
|
if r != utf8.RuneError {
|
||
|
if r > max || r < min {
|
||
|
return false
|
||
|
}
|
||
|
r -= min
|
||
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
||
|
return false
|
||
|
}
|
||
|
env.NextChar()
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
|
||
|
if env.Cursor <= env.LimitBackward {
|
||
|
return false
|
||
|
}
|
||
|
env.PrevChar()
|
||
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
||
|
if r != utf8.RuneError {
|
||
|
env.NextChar()
|
||
|
if r > max || r < min {
|
||
|
return false
|
||
|
}
|
||
|
r -= min
|
||
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
||
|
return false
|
||
|
}
|
||
|
env.PrevChar()
|
||
|
return true
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
|
||
|
if env.Cursor >= env.Limit {
|
||
|
return false
|
||
|
}
|
||
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
||
|
if r != utf8.RuneError {
|
||
|
if r > max || r < min {
|
||
|
env.NextChar()
|
||
|
return true
|
||
|
}
|
||
|
r -= min
|
||
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
||
|
env.NextChar()
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
|
||
|
if env.Cursor <= env.LimitBackward {
|
||
|
return false
|
||
|
}
|
||
|
env.PrevChar()
|
||
|
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
||
|
if r != utf8.RuneError {
|
||
|
env.NextChar()
|
||
|
if r > max || r < min {
|
||
|
env.PrevChar()
|
||
|
return true
|
||
|
}
|
||
|
r -= min
|
||
|
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
||
|
env.PrevChar()
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (env *Env) SliceDel() bool {
|
||
|
return env.SliceFrom("")
|
||
|
}
|
||
|
|
||
|
func (env *Env) Insert(bra, ket int, s string) {
|
||
|
adjustment := env.ReplaceS(bra, ket, s)
|
||
|
if bra <= env.Bra {
|
||
|
env.Bra = int(int32(env.Bra) + adjustment)
|
||
|
}
|
||
|
if bra <= env.Ket {
|
||
|
env.Ket = int(int32(env.Ket) + adjustment)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) SliceTo() string {
|
||
|
return env.current[env.Bra:env.Ket]
|
||
|
}
|
||
|
|
||
|
func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 {
|
||
|
var i int32
|
||
|
j := int32(len(amongs))
|
||
|
|
||
|
c := env.Cursor
|
||
|
l := env.Limit
|
||
|
|
||
|
var commonI, commonJ int
|
||
|
|
||
|
firstKeyInspected := false
|
||
|
for {
|
||
|
k := i + ((j - i) >> 1)
|
||
|
var diff int32
|
||
|
common := min(commonI, commonJ)
|
||
|
w := amongs[k]
|
||
|
for lvar := common; lvar < len(w.Str); lvar++ {
|
||
|
if c+common == l {
|
||
|
diff--
|
||
|
break
|
||
|
}
|
||
|
diff = int32(env.current[c+common]) - int32(w.Str[lvar])
|
||
|
if diff != 0 {
|
||
|
break
|
||
|
}
|
||
|
common++
|
||
|
}
|
||
|
if diff < 0 {
|
||
|
j = k
|
||
|
commonJ = common
|
||
|
} else {
|
||
|
i = k
|
||
|
commonI = common
|
||
|
}
|
||
|
if j-i <= 1 {
|
||
|
if i > 0 {
|
||
|
break
|
||
|
}
|
||
|
if j == i {
|
||
|
break
|
||
|
}
|
||
|
if firstKeyInspected {
|
||
|
break
|
||
|
}
|
||
|
firstKeyInspected = true
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for {
|
||
|
w := amongs[i]
|
||
|
if commonI >= len(w.Str) {
|
||
|
env.Cursor = c + len(w.Str)
|
||
|
if w.F != nil {
|
||
|
res := w.F(env, ctx)
|
||
|
env.Cursor = c + len(w.Str)
|
||
|
if res {
|
||
|
return w.B
|
||
|
}
|
||
|
} else {
|
||
|
return w.B
|
||
|
}
|
||
|
}
|
||
|
i = w.A
|
||
|
if i < 0 {
|
||
|
return 0
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 {
|
||
|
var i int32
|
||
|
j := int32(len(amongs))
|
||
|
|
||
|
c := env.Cursor
|
||
|
lb := env.LimitBackward
|
||
|
|
||
|
var commonI, commonJ int
|
||
|
|
||
|
firstKeyInspected := false
|
||
|
|
||
|
for {
|
||
|
k := i + ((j - i) >> 1)
|
||
|
diff := int32(0)
|
||
|
common := min(commonI, commonJ)
|
||
|
w := amongs[k]
|
||
|
for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- {
|
||
|
if c-common == lb {
|
||
|
diff--
|
||
|
break
|
||
|
}
|
||
|
diff = int32(env.current[c-common-1]) - int32(w.Str[lvar])
|
||
|
if diff != 0 {
|
||
|
break
|
||
|
}
|
||
|
// Count up commons. But not one character but the byte width of that char
|
||
|
common++
|
||
|
}
|
||
|
if diff < 0 {
|
||
|
j = k
|
||
|
commonJ = common
|
||
|
} else {
|
||
|
i = k
|
||
|
commonI = common
|
||
|
}
|
||
|
if j-i <= 1 {
|
||
|
if i > 0 {
|
||
|
break
|
||
|
}
|
||
|
if j == i {
|
||
|
break
|
||
|
}
|
||
|
if firstKeyInspected {
|
||
|
break
|
||
|
}
|
||
|
firstKeyInspected = true
|
||
|
}
|
||
|
}
|
||
|
for {
|
||
|
w := amongs[i]
|
||
|
if commonI >= len(w.Str) {
|
||
|
env.Cursor = c - len(w.Str)
|
||
|
if w.F != nil {
|
||
|
res := w.F(env, ctx)
|
||
|
env.Cursor = c - len(w.Str)
|
||
|
if res {
|
||
|
return w.B
|
||
|
}
|
||
|
} else {
|
||
|
return w.B
|
||
|
}
|
||
|
}
|
||
|
i = w.A
|
||
|
if i < 0 {
|
||
|
return 0
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (env *Env) Debug(count, lineNumber int) {
|
||
|
log.Printf("snowball debug, count: %d, line: %d", count, lineNumber)
|
||
|
}
|
||
|
|
||
|
func (env *Env) Clone() *Env {
|
||
|
clone := *env
|
||
|
return &clone
|
||
|
}
|
||
|
|
||
|
func (env *Env) AssignTo() string {
|
||
|
return env.Current()
|
||
|
}
|