chroma-markdown/vendor/github.com/dlclark/regexp2/syntax/tree.go

655 lines
16 KiB
Go
Raw Normal View History

2017-10-22 05:37:38 +02:00
package syntax
import (
"bytes"
"fmt"
"math"
"strconv"
)
type RegexTree struct {
root *regexNode
caps map[int]int
capnumlist []int
captop int
Capnames map[string]int
Caplist []string
options RegexOptions
}
// It is built into a parsed tree for a regular expression.
// Implementation notes:
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// RegexNodes are built into a tree, linked by the n.children list.
// Each node also has a n.parent and n.ichild member indicating
// its parent and which child # it is in its parent's list.
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
//
// Because perl 5 allows "lookback" groups that scan backwards,
// each node also gets a "direction". Normally the value of
// boolean n.backward = false.
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a n.next
// pointer. [Note that to save a few bytes, we could overload the
// n.parent pointer instead.]
//
// On the parse stack, each tree has a "role" - basically, the
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in n.role.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// n.operands, an an object (either a string or a set)
// is stored in n.data
type regexNode struct {
t nodeType
children []*regexNode
str []rune
set *CharSet
ch rune
m int
n int
options RegexOptions
next *regexNode
}
type nodeType int32
const (
// The following are leaves, and correspond to primitive operations
ntOnerep nodeType = 0 // lef,back char,min,max a {n}
ntNotonerep = 1 // lef,back char,min,max .{n}
ntSetrep = 2 // lef,back set,min,max [\d]{n}
ntOneloop = 3 // lef,back char,min,max a {,n}
ntNotoneloop = 4 // lef,back char,min,max .{,n}
ntSetloop = 5 // lef,back set,min,max [\d]{,n}
ntOnelazy = 6 // lef,back char,min,max a {,n}?
ntNotonelazy = 7 // lef,back char,min,max .{,n}?
ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
ntOne = 9 // lef char a
ntNotone = 10 // lef char [^a]
ntSet = 11 // lef set [a-z\s] \w \s \d
ntMulti = 12 // lef string abcd
ntRef = 13 // lef group \#
ntBol = 14 // ^
ntEol = 15 // $
ntBoundary = 16 // \b
ntNonboundary = 17 // \B
ntBeginning = 18 // \A
ntStart = 19 // \G
ntEndZ = 20 // \Z
ntEnd = 21 // \Z
// Interior nodes do not correspond to primitive operations, but
// control structures compositing other operations
// Concat and alternate take n children, and can run forward or backwards
ntNothing = 22 // []
ntEmpty = 23 // ()
ntAlternate = 24 // a|b
ntConcatenate = 25 // ab
ntLoop = 26 // m,x * + ? {,}
ntLazyloop = 27 // m,x *? +? ?? {,}?
ntCapture = 28 // n ()
ntGroup = 29 // (?:)
ntRequire = 30 // (?=) (?<=)
ntPrevent = 31 // (?!) (?<!)
ntGreedy = 32 // (?>) (?<)
ntTestref = 33 // (?(n) | )
ntTestgroup = 34 // (?(...) | )
ntECMABoundary = 41 // \b
ntNonECMABoundary = 42 // \B
)
func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
return &regexNode{
t: t,
options: opt,
}
}
func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
return &regexNode{
t: t,
options: opt,
ch: ch,
}
}
func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
return &regexNode{
t: t,
options: opt,
str: str,
}
}
func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
return &regexNode{
t: t,
options: opt,
set: set,
}
}
func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
return &regexNode{
t: t,
options: opt,
m: m,
}
}
func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
return &regexNode{
t: t,
options: opt,
m: m,
n: n,
}
}
func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
for i := 0; i < len(n.str); i++ {
buf.WriteRune(n.str[i])
}
}
func (n *regexNode) addChild(child *regexNode) {
reduced := child.reduce()
n.children = append(n.children, reduced)
reduced.next = n
}
func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
}
// removes children including the start but not the end index
func (n *regexNode) removeChildren(startIndex, endIndex int) {
n.children = append(n.children[:startIndex], n.children[endIndex:]...)
}
// Pass type as OneLazy or OneLoop
func (n *regexNode) makeRep(t nodeType, min, max int) {
n.t += (t - ntOne)
n.m = min
n.n = max
}
func (n *regexNode) reduce() *regexNode {
switch n.t {
case ntAlternate:
return n.reduceAlternation()
case ntConcatenate:
return n.reduceConcatenation()
case ntLoop, ntLazyloop:
return n.reduceRep()
case ntGroup:
return n.reduceGroup()
case ntSet, ntSetloop:
return n.reduceSet()
default:
return n
}
}
// Basic optimization. Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
// a|b|c|def|g|h -> [a-c]|def|[gh]
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
func (n *regexNode) reduceAlternation() *regexNode {
if len(n.children) == 0 {
return newRegexNode(ntNothing, n.options)
}
wasLastSet := false
lastNodeCannotMerge := false
var optionsLast RegexOptions
var i, j int
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
at := n.children[i]
if j < i {
n.children[j] = at
}
for {
if at.t == ntAlternate {
for k := 0; k < len(at.children); k++ {
at.children[k].next = n
}
n.insertChildren(i+1, at.children)
j--
} else if at.t == ntSet || at.t == ntOne {
// Cannot merge sets if L or I options differ, or if either are negated.
optionsAt := at.options & (RightToLeft | IgnoreCase)
if at.t == ntSet {
if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
wasLastSet = true
lastNodeCannotMerge = !at.set.IsMergeable()
optionsLast = optionsAt
break
}
} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
wasLastSet = true
lastNodeCannotMerge = false
optionsLast = optionsAt
break
}
// The last node was a Set or a One, we're a Set or One and our options are the same.
// Merge the two nodes.
j--
prev := n.children[j]
var prevCharClass *CharSet
if prev.t == ntOne {
prevCharClass = &CharSet{}
prevCharClass.addChar(prev.ch)
} else {
prevCharClass = prev.set
}
if at.t == ntOne {
prevCharClass.addChar(at.ch)
} else {
prevCharClass.addSet(*at.set)
}
prev.t = ntSet
prev.set = prevCharClass
} else if at.t == ntNothing {
j--
} else {
wasLastSet = false
lastNodeCannotMerge = false
}
break
}
}
if j < i {
n.removeChildren(j, i)
}
return n.stripEnation(ntNothing)
}
// Basic optimization. Adjacent strings can be concatenated.
//
// (?:abc)(?:def) -> abcdef
func (n *regexNode) reduceConcatenation() *regexNode {
// Eliminate empties and concat adjacent strings/chars
var optionsLast RegexOptions
var optionsAt RegexOptions
var i, j int
if len(n.children) == 0 {
return newRegexNode(ntEmpty, n.options)
}
wasLastString := false
for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
var at, prev *regexNode
at = n.children[i]
if j < i {
n.children[j] = at
}
if at.t == ntConcatenate &&
((at.options & RightToLeft) == (n.options & RightToLeft)) {
for k := 0; k < len(at.children); k++ {
at.children[k].next = n
}
//insert at.children at i+1 index in n.children
n.insertChildren(i+1, at.children)
j--
} else if at.t == ntMulti || at.t == ntOne {
// Cannot merge strings if L or I options differ
optionsAt = at.options & (RightToLeft | IgnoreCase)
if !wasLastString || optionsLast != optionsAt {
wasLastString = true
optionsLast = optionsAt
continue
}
j--
prev = n.children[j]
if prev.t == ntOne {
prev.t = ntMulti
prev.str = []rune{prev.ch}
}
if (optionsAt & RightToLeft) == 0 {
if at.t == ntOne {
prev.str = append(prev.str, at.ch)
} else {
prev.str = append(prev.str, at.str...)
}
} else {
if at.t == ntOne {
// insert at the front by expanding our slice, copying the data over, and then setting the value
prev.str = append(prev.str, 0)
copy(prev.str[1:], prev.str)
prev.str[0] = at.ch
} else {
//insert at the front...this one we'll make a new slice and copy both into it
merge := make([]rune, len(prev.str)+len(at.str))
copy(merge, at.str)
copy(merge[len(at.str):], prev.str)
prev.str = merge
}
}
} else if at.t == ntEmpty {
j--
} else {
wasLastString = false
}
}
if j < i {
// remove indices j through i from the children
n.removeChildren(j, i)
}
return n.stripEnation(ntEmpty)
}
// Nested repeaters just get multiplied with each other if they're not
// too lumpy
func (n *regexNode) reduceRep() *regexNode {
u := n
t := n.t
min := n.m
max := n.n
for {
if len(u.children) == 0 {
break
}
child := u.children[0]
// multiply reps of the same type only
if child.t != t {
childType := child.t
if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
break
}
}
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
// [but things like (a {2,})+ are not too lumpy...]
if u.m == 0 && child.m > 1 || child.n < child.m*2 {
break
}
u = child
if u.m > 0 {
if (math.MaxInt32-1)/u.m < min {
u.m = math.MaxInt32
} else {
u.m = u.m * min
}
}
if u.n > 0 {
if (math.MaxInt32-1)/u.n < max {
u.n = math.MaxInt32
} else {
u.n = u.n * max
}
}
}
if math.MaxInt32 == min {
return newRegexNode(ntNothing, n.options)
}
return u
}
// Simple optimization. If a concatenation or alternation has only
// one child strip out the intermediate node. If it has zero children,
// turn it into an empty.
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
switch len(n.children) {
case 0:
return newRegexNode(emptyType, n.options)
case 1:
return n.children[0]
default:
return n
}
}
func (n *regexNode) reduceGroup() *regexNode {
u := n
for u.t == ntGroup {
u = u.children[0]
}
return u
}
// Simple optimization. If a set is a singleton, an inverse singleton,
// or empty, it's transformed accordingly.
func (n *regexNode) reduceSet() *regexNode {
// Extract empty-set, one and not-one case as special
if n.set == nil {
n.t = ntNothing
} else if n.set.IsSingleton() {
n.ch = n.set.SingletonChar()
n.set = nil
n.t += (ntOne - ntSet)
} else if n.set.IsSingletonInverse() {
n.ch = n.set.SingletonChar()
n.set = nil
n.t += (ntNotone - ntSet)
}
return n
}
func (n *regexNode) reverseLeft() *regexNode {
if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
//reverse children order
for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
n.children[left], n.children[right] = n.children[right], n.children[left]
}
}
return n
}
func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
if min == 0 && max == 0 {
return newRegexNode(ntEmpty, n.options)
}
if min == 1 && max == 1 {
return n
}
switch n.t {
case ntOne, ntNotone, ntSet:
if lazy {
n.makeRep(Onelazy, min, max)
} else {
n.makeRep(Oneloop, min, max)
}
return n
default:
var t nodeType
if lazy {
t = ntLazyloop
} else {
t = ntLoop
}
result := newRegexNodeMN(t, n.options, min, max)
result.addChild(n)
return result
}
}
// debug functions
var typeStr = []string{
"Onerep", "Notonerep", "Setrep",
"Oneloop", "Notoneloop", "Setloop",
"Onelazy", "Notonelazy", "Setlazy",
"One", "Notone", "Set",
"Multi", "Ref",
"Bol", "Eol", "Boundary", "Nonboundary",
"Beginning", "Start", "EndZ", "End",
"Nothing", "Empty",
"Alternate", "Concatenate",
"Loop", "Lazyloop",
"Capture", "Group", "Require", "Prevent", "Greedy",
"Testref", "Testgroup",
"Unknown", "Unknown", "Unknown",
"Unknown", "Unknown", "Unknown",
"ECMABoundary", "NonECMABoundary",
}
func (n *regexNode) description() string {
buf := &bytes.Buffer{}
buf.WriteString(typeStr[n.t])
if (n.options & ExplicitCapture) != 0 {
buf.WriteString("-C")
}
if (n.options & IgnoreCase) != 0 {
buf.WriteString("-I")
}
if (n.options & RightToLeft) != 0 {
buf.WriteString("-L")
}
if (n.options & Multiline) != 0 {
buf.WriteString("-M")
}
if (n.options & Singleline) != 0 {
buf.WriteString("-S")
}
if (n.options & IgnorePatternWhitespace) != 0 {
buf.WriteString("-X")
}
if (n.options & ECMAScript) != 0 {
buf.WriteString("-E")
}
switch n.t {
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
break
case ntCapture:
buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
break
case ntRef, ntTestref:
buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
break
case ntMulti:
fmt.Fprintf(buf, "(String = %s)", string(n.str))
break
case ntSet, ntSetloop, ntSetlazy:
buf.WriteString("(Set = " + n.set.String() + ")")
break
}
switch n.t {
case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
buf.WriteString("(Min = ")
buf.WriteString(strconv.Itoa(n.m))
buf.WriteString(", Max = ")
if n.n == math.MaxInt32 {
buf.WriteString("inf")
} else {
buf.WriteString(strconv.Itoa(n.n))
}
buf.WriteString(")")
break
}
return buf.String()
}
var padSpace = []byte(" ")
func (t *RegexTree) Dump() string {
return t.root.dump()
}
func (n *regexNode) dump() string {
var stack []int
CurNode := n
CurChild := 0
buf := bytes.NewBufferString(CurNode.description())
buf.WriteRune('\n')
for {
if CurNode.children != nil && CurChild < len(CurNode.children) {
stack = append(stack, CurChild+1)
CurNode = CurNode.children[CurChild]
CurChild = 0
Depth := len(stack)
if Depth > 32 {
Depth = 32
}
buf.Write(padSpace[:Depth])
buf.WriteString(CurNode.description())
buf.WriteRune('\n')
} else {
if len(stack) == 0 {
break
}
CurChild = stack[len(stack)-1]
stack = stack[:len(stack)-1]
CurNode = CurNode.next
}
}
return buf.String()
}