You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							500 lines
						
					
					
						
							11 KiB
						
					
					
				
			
		
		
	
	
							500 lines
						
					
					
						
							11 KiB
						
					
					
				| package syntax
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"fmt"
 | |
| 	"math"
 | |
| 	"os"
 | |
| )
 | |
| 
 | |
| func Write(tree *RegexTree) (*Code, error) {
 | |
| 	w := writer{
 | |
| 		intStack:   make([]int, 0, 32),
 | |
| 		emitted:    make([]int, 2),
 | |
| 		stringhash: make(map[string]int),
 | |
| 		sethash:    make(map[string]int),
 | |
| 	}
 | |
| 
 | |
| 	code, err := w.codeFromTree(tree)
 | |
| 
 | |
| 	if tree.options&Debug > 0 && code != nil {
 | |
| 		os.Stdout.WriteString(code.Dump())
 | |
| 		os.Stdout.WriteString("\n")
 | |
| 	}
 | |
| 
 | |
| 	return code, err
 | |
| }
 | |
| 
 | |
| type writer struct {
 | |
| 	emitted []int
 | |
| 
 | |
| 	intStack    []int
 | |
| 	curpos      int
 | |
| 	stringhash  map[string]int
 | |
| 	stringtable [][]rune
 | |
| 	sethash     map[string]int
 | |
| 	settable    []*CharSet
 | |
| 	counting    bool
 | |
| 	count       int
 | |
| 	trackcount  int
 | |
| 	caps        map[int]int
 | |
| }
 | |
| 
 | |
| const (
 | |
| 	beforeChild nodeType = 64
 | |
| 	afterChild           = 128
 | |
| 	//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
 | |
| 	MaxPrefixSize = 50
 | |
| )
 | |
| 
 | |
| // The top level RegexCode generator. It does a depth-first walk
 | |
| // through the tree and calls EmitFragment to emits code before
 | |
| // and after each child of an interior node, and at each leaf.
 | |
| //
 | |
| // It runs two passes, first to count the size of the generated
 | |
| // code, and second to generate the code.
 | |
| //
 | |
| // We should time it against the alternative, which is
 | |
| // to just generate the code and grow the array as we go.
 | |
| func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
 | |
| 	var (
 | |
| 		curNode  *regexNode
 | |
| 		curChild int
 | |
| 		capsize  int
 | |
| 	)
 | |
| 	// construct sparse capnum mapping if some numbers are unused
 | |
| 
 | |
| 	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
 | |
| 		capsize = tree.captop
 | |
| 		w.caps = nil
 | |
| 	} else {
 | |
| 		capsize = len(tree.capnumlist)
 | |
| 		w.caps = tree.caps
 | |
| 		for i := 0; i < len(tree.capnumlist); i++ {
 | |
| 			w.caps[tree.capnumlist[i]] = i
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	w.counting = true
 | |
| 
 | |
| 	for {
 | |
| 		if !w.counting {
 | |
| 			w.emitted = make([]int, w.count)
 | |
| 		}
 | |
| 
 | |
| 		curNode = tree.root
 | |
| 		curChild = 0
 | |
| 
 | |
| 		w.emit1(Lazybranch, 0)
 | |
| 
 | |
| 		for {
 | |
| 			if len(curNode.children) == 0 {
 | |
| 				w.emitFragment(curNode.t, curNode, 0)
 | |
| 			} else if curChild < len(curNode.children) {
 | |
| 				w.emitFragment(curNode.t|beforeChild, curNode, curChild)
 | |
| 
 | |
| 				curNode = curNode.children[curChild]
 | |
| 
 | |
| 				w.pushInt(curChild)
 | |
| 				curChild = 0
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			if w.emptyStack() {
 | |
| 				break
 | |
| 			}
 | |
| 
 | |
| 			curChild = w.popInt()
 | |
| 			curNode = curNode.next
 | |
| 
 | |
| 			w.emitFragment(curNode.t|afterChild, curNode, curChild)
 | |
| 			curChild++
 | |
| 		}
 | |
| 
 | |
| 		w.patchJump(0, w.curPos())
 | |
| 		w.emit(Stop)
 | |
| 
 | |
| 		if !w.counting {
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		w.counting = false
 | |
| 	}
 | |
| 
 | |
| 	fcPrefix := getFirstCharsPrefix(tree)
 | |
| 	prefix := getPrefix(tree)
 | |
| 	rtl := (tree.options & RightToLeft) != 0
 | |
| 
 | |
| 	var bmPrefix *BmPrefix
 | |
| 	//TODO: benchmark string prefixes
 | |
| 	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
 | |
| 		if len(prefix.PrefixStr) > MaxPrefixSize {
 | |
| 			// limit prefix changes to 10k
 | |
| 			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
 | |
| 		}
 | |
| 		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
 | |
| 	} else {
 | |
| 		bmPrefix = nil
 | |
| 	}
 | |
| 
 | |
| 	return &Code{
 | |
| 		Codes:       w.emitted,
 | |
| 		Strings:     w.stringtable,
 | |
| 		Sets:        w.settable,
 | |
| 		TrackCount:  w.trackcount,
 | |
| 		Caps:        w.caps,
 | |
| 		Capsize:     capsize,
 | |
| 		FcPrefix:    fcPrefix,
 | |
| 		BmPrefix:    bmPrefix,
 | |
| 		Anchors:     getAnchors(tree),
 | |
| 		RightToLeft: rtl,
 | |
| 	}, nil
 | |
| }
 | |
| 
 | |
| // The main RegexCode generator. It does a depth-first walk
 | |
| // through the tree and calls EmitFragment to emits code before
 | |
| // and after each child of an interior node, and at each leaf.
 | |
| func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
 | |
| 	bits := InstOp(0)
 | |
| 
 | |
| 	if nodetype <= ntRef {
 | |
| 		if (node.options & RightToLeft) != 0 {
 | |
| 			bits |= Rtl
 | |
| 		}
 | |
| 		if (node.options & IgnoreCase) != 0 {
 | |
| 			bits |= Ci
 | |
| 		}
 | |
| 	}
 | |
| 	ntBits := nodeType(bits)
 | |
| 
 | |
| 	switch nodetype {
 | |
| 	case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
 | |
| 		break
 | |
| 
 | |
| 	case ntAlternate | beforeChild:
 | |
| 		if curIndex < len(node.children)-1 {
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Lazybranch, 0)
 | |
| 		}
 | |
| 
 | |
| 	case ntAlternate | afterChild:
 | |
| 		if curIndex < len(node.children)-1 {
 | |
| 			lbPos := w.popInt()
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Goto, 0)
 | |
| 			w.patchJump(lbPos, w.curPos())
 | |
| 		} else {
 | |
| 			for i := 0; i < curIndex; i++ {
 | |
| 				w.patchJump(w.popInt(), w.curPos())
 | |
| 			}
 | |
| 		}
 | |
| 		break
 | |
| 
 | |
| 	case ntTestref | beforeChild:
 | |
| 		if curIndex == 0 {
 | |
| 			w.emit(Setjump)
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Lazybranch, 0)
 | |
| 			w.emit1(Testref, w.mapCapnum(node.m))
 | |
| 			w.emit(Forejump)
 | |
| 		}
 | |
| 
 | |
| 	case ntTestref | afterChild:
 | |
| 		if curIndex == 0 {
 | |
| 			branchpos := w.popInt()
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Goto, 0)
 | |
| 			w.patchJump(branchpos, w.curPos())
 | |
| 			w.emit(Forejump)
 | |
| 			if len(node.children) <= 1 {
 | |
| 				w.patchJump(w.popInt(), w.curPos())
 | |
| 			}
 | |
| 		} else if curIndex == 1 {
 | |
| 			w.patchJump(w.popInt(), w.curPos())
 | |
| 		}
 | |
| 
 | |
| 	case ntTestgroup | beforeChild:
 | |
| 		if curIndex == 0 {
 | |
| 			w.emit(Setjump)
 | |
| 			w.emit(Setmark)
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Lazybranch, 0)
 | |
| 		}
 | |
| 
 | |
| 	case ntTestgroup | afterChild:
 | |
| 		if curIndex == 0 {
 | |
| 			w.emit(Getmark)
 | |
| 			w.emit(Forejump)
 | |
| 		} else if curIndex == 1 {
 | |
| 			Branchpos := w.popInt()
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Goto, 0)
 | |
| 			w.patchJump(Branchpos, w.curPos())
 | |
| 			w.emit(Getmark)
 | |
| 			w.emit(Forejump)
 | |
| 			if len(node.children) <= 2 {
 | |
| 				w.patchJump(w.popInt(), w.curPos())
 | |
| 			}
 | |
| 		} else if curIndex == 2 {
 | |
| 			w.patchJump(w.popInt(), w.curPos())
 | |
| 		}
 | |
| 
 | |
| 	case ntLoop | beforeChild, ntLazyloop | beforeChild:
 | |
| 
 | |
| 		if node.n < math.MaxInt32 || node.m > 1 {
 | |
| 			if node.m == 0 {
 | |
| 				w.emit1(Nullcount, 0)
 | |
| 			} else {
 | |
| 				w.emit1(Setcount, 1-node.m)
 | |
| 			}
 | |
| 		} else if node.m == 0 {
 | |
| 			w.emit(Nullmark)
 | |
| 		} else {
 | |
| 			w.emit(Setmark)
 | |
| 		}
 | |
| 
 | |
| 		if node.m == 0 {
 | |
| 			w.pushInt(w.curPos())
 | |
| 			w.emit1(Goto, 0)
 | |
| 		}
 | |
| 		w.pushInt(w.curPos())
 | |
| 
 | |
| 	case ntLoop | afterChild, ntLazyloop | afterChild:
 | |
| 
 | |
| 		startJumpPos := w.curPos()
 | |
| 		lazy := (nodetype - (ntLoop | afterChild))
 | |
| 
 | |
| 		if node.n < math.MaxInt32 || node.m > 1 {
 | |
| 			if node.n == math.MaxInt32 {
 | |
| 				w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
 | |
| 			} else {
 | |
| 				w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
 | |
| 			}
 | |
| 		} else {
 | |
| 			w.emit1(InstOp(Branchmark+lazy), w.popInt())
 | |
| 		}
 | |
| 
 | |
| 		if node.m == 0 {
 | |
| 			w.patchJump(w.popInt(), startJumpPos)
 | |
| 		}
 | |
| 
 | |
| 	case ntGroup | beforeChild, ntGroup | afterChild:
 | |
| 
 | |
| 	case ntCapture | beforeChild:
 | |
| 		w.emit(Setmark)
 | |
| 
 | |
| 	case ntCapture | afterChild:
 | |
| 		w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
 | |
| 
 | |
| 	case ntRequire | beforeChild:
 | |
| 		// NOTE: the following line causes lookahead/lookbehind to be
 | |
| 		// NON-BACKTRACKING. It can be commented out with (*)
 | |
| 		w.emit(Setjump)
 | |
| 
 | |
| 		w.emit(Setmark)
 | |
| 
 | |
| 	case ntRequire | afterChild:
 | |
| 		w.emit(Getmark)
 | |
| 
 | |
| 		// NOTE: the following line causes lookahead/lookbehind to be
 | |
| 		// NON-BACKTRACKING. It can be commented out with (*)
 | |
| 		w.emit(Forejump)
 | |
| 
 | |
| 	case ntPrevent | beforeChild:
 | |
| 		w.emit(Setjump)
 | |
| 		w.pushInt(w.curPos())
 | |
| 		w.emit1(Lazybranch, 0)
 | |
| 
 | |
| 	case ntPrevent | afterChild:
 | |
| 		w.emit(Backjump)
 | |
| 		w.patchJump(w.popInt(), w.curPos())
 | |
| 		w.emit(Forejump)
 | |
| 
 | |
| 	case ntGreedy | beforeChild:
 | |
| 		w.emit(Setjump)
 | |
| 
 | |
| 	case ntGreedy | afterChild:
 | |
| 		w.emit(Forejump)
 | |
| 
 | |
| 	case ntOne, ntNotone:
 | |
| 		w.emit1(InstOp(node.t|ntBits), int(node.ch))
 | |
| 
 | |
| 	case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
 | |
| 		if node.m > 0 {
 | |
| 			if node.t == ntOneloop || node.t == ntOnelazy {
 | |
| 				w.emit2(Onerep|bits, int(node.ch), node.m)
 | |
| 			} else {
 | |
| 				w.emit2(Notonerep|bits, int(node.ch), node.m)
 | |
| 			}
 | |
| 		}
 | |
| 		if node.n > node.m {
 | |
| 			if node.n == math.MaxInt32 {
 | |
| 				w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
 | |
| 			} else {
 | |
| 				w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 	case ntSetloop, ntSetlazy:
 | |
| 		if node.m > 0 {
 | |
| 			w.emit2(Setrep|bits, w.setCode(node.set), node.m)
 | |
| 		}
 | |
| 		if node.n > node.m {
 | |
| 			if node.n == math.MaxInt32 {
 | |
| 				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
 | |
| 			} else {
 | |
| 				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 	case ntMulti:
 | |
| 		w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
 | |
| 
 | |
| 	case ntSet:
 | |
| 		w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
 | |
| 
 | |
| 	case ntRef:
 | |
| 		w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
 | |
| 
 | |
| 	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
 | |
| 		w.emit(InstOp(node.t))
 | |
| 
 | |
| 	default:
 | |
| 		return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // To avoid recursion, we use a simple integer stack.
 | |
| // This is the push.
 | |
| func (w *writer) pushInt(i int) {
 | |
| 	w.intStack = append(w.intStack, i)
 | |
| }
 | |
| 
 | |
| // Returns true if the stack is empty.
 | |
| func (w *writer) emptyStack() bool {
 | |
| 	return len(w.intStack) == 0
 | |
| }
 | |
| 
 | |
| // This is the pop.
 | |
| func (w *writer) popInt() int {
 | |
| 	//get our item
 | |
| 	idx := len(w.intStack) - 1
 | |
| 	i := w.intStack[idx]
 | |
| 	//trim our slice
 | |
| 	w.intStack = w.intStack[:idx]
 | |
| 	return i
 | |
| }
 | |
| 
 | |
| // Returns the current position in the emitted code.
 | |
| func (w *writer) curPos() int {
 | |
| 	return w.curpos
 | |
| }
 | |
| 
 | |
| // Fixes up a jump instruction at the specified offset
 | |
| // so that it jumps to the specified jumpDest.
 | |
| func (w *writer) patchJump(offset, jumpDest int) {
 | |
| 	w.emitted[offset+1] = jumpDest
 | |
| }
 | |
| 
 | |
| // Returns an index in the set table for a charset
 | |
| // uses a map to eliminate duplicates.
 | |
| func (w *writer) setCode(set *CharSet) int {
 | |
| 	if w.counting {
 | |
| 		return 0
 | |
| 	}
 | |
| 
 | |
| 	buf := &bytes.Buffer{}
 | |
| 
 | |
| 	set.mapHashFill(buf)
 | |
| 	hash := buf.String()
 | |
| 	i, ok := w.sethash[hash]
 | |
| 	if !ok {
 | |
| 		i = len(w.sethash)
 | |
| 		w.sethash[hash] = i
 | |
| 		w.settable = append(w.settable, set)
 | |
| 	}
 | |
| 	return i
 | |
| }
 | |
| 
 | |
| // Returns an index in the string table for a string.
 | |
| // uses a map to eliminate duplicates.
 | |
| func (w *writer) stringCode(str []rune) int {
 | |
| 	if w.counting {
 | |
| 		return 0
 | |
| 	}
 | |
| 
 | |
| 	hash := string(str)
 | |
| 	i, ok := w.stringhash[hash]
 | |
| 	if !ok {
 | |
| 		i = len(w.stringhash)
 | |
| 		w.stringhash[hash] = i
 | |
| 		w.stringtable = append(w.stringtable, str)
 | |
| 	}
 | |
| 
 | |
| 	return i
 | |
| }
 | |
| 
 | |
| // When generating code on a regex that uses a sparse set
 | |
| // of capture slots, we hash them to a dense set of indices
 | |
| // for an array of capture slots. Instead of doing the hash
 | |
| // at match time, it's done at compile time, here.
 | |
| func (w *writer) mapCapnum(capnum int) int {
 | |
| 	if capnum == -1 {
 | |
| 		return -1
 | |
| 	}
 | |
| 
 | |
| 	if w.caps != nil {
 | |
| 		return w.caps[capnum]
 | |
| 	}
 | |
| 
 | |
| 	return capnum
 | |
| }
 | |
| 
 | |
| // Emits a zero-argument operation. Note that the emit
 | |
| // functions all run in two modes: they can emit code, or
 | |
| // they can just count the size of the code.
 | |
| func (w *writer) emit(op InstOp) {
 | |
| 	if w.counting {
 | |
| 		w.count++
 | |
| 		if opcodeBacktracks(op) {
 | |
| 			w.trackcount++
 | |
| 		}
 | |
| 		return
 | |
| 	}
 | |
| 	w.emitted[w.curpos] = int(op)
 | |
| 	w.curpos++
 | |
| }
 | |
| 
 | |
| // Emits a one-argument operation.
 | |
| func (w *writer) emit1(op InstOp, opd1 int) {
 | |
| 	if w.counting {
 | |
| 		w.count += 2
 | |
| 		if opcodeBacktracks(op) {
 | |
| 			w.trackcount++
 | |
| 		}
 | |
| 		return
 | |
| 	}
 | |
| 	w.emitted[w.curpos] = int(op)
 | |
| 	w.curpos++
 | |
| 	w.emitted[w.curpos] = opd1
 | |
| 	w.curpos++
 | |
| }
 | |
| 
 | |
| // Emits a two-argument operation.
 | |
| func (w *writer) emit2(op InstOp, opd1, opd2 int) {
 | |
| 	if w.counting {
 | |
| 		w.count += 3
 | |
| 		if opcodeBacktracks(op) {
 | |
| 			w.trackcount++
 | |
| 		}
 | |
| 		return
 | |
| 	}
 | |
| 	w.emitted[w.curpos] = int(op)
 | |
| 	w.curpos++
 | |
| 	w.emitted[w.curpos] = opd1
 | |
| 	w.curpos++
 | |
| 	w.emitted[w.curpos] = opd2
 | |
| 	w.curpos++
 | |
| }
 | |
| 
 |