410 lines
9.5 KiB
Go
410 lines
9.5 KiB
Go
package regexp2
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// Process the file "testoutput1" from PCRE2 v10.21 (public domain)
|
|
// totalCount is incremented once for every pattern read from the test file;
// failCount is incremented by problem each time a failure is recorded.
// TestPcre_Basics logs both when it finishes with at least one failure.
var totalCount, failCount = 0, 0
|
|
|
|
func TestPcre_Basics(t *testing.T) {
|
|
defer func() {
|
|
if failCount > 0 {
|
|
t.Logf("%v of %v patterns failed", failCount, totalCount)
|
|
}
|
|
}()
|
|
// open our test patterns file and run through it
|
|
// validating results as we go
|
|
file, err := os.Open("testoutput1")
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
defer file.Close()
|
|
|
|
// the high level structure of the file:
|
|
// #comments - ignore only outside of the pattern
|
|
// pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't
|
|
// test case
|
|
// 0: success case
|
|
// \= Expect no match (ignored)
|
|
// another test case
|
|
// No Match
|
|
//
|
|
// another pattern ...etc
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
// main pattern loop
|
|
for scanner.Scan() {
|
|
// reading the file a line at a time
|
|
line := scanner.Text()
|
|
|
|
if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") {
|
|
// skip blanks and comments
|
|
continue
|
|
}
|
|
|
|
patternStart := line[0]
|
|
if patternStart != '/' && patternStart != '"' {
|
|
// an error! expected a pattern but we didn't understand what was in the file
|
|
t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line)
|
|
}
|
|
|
|
// start building our pattern, handling multi-line patterns
|
|
pattern := line
|
|
totalCount++
|
|
|
|
// keep appending the lines to our pattern string until we
|
|
// find our closing tag, don't allow the first char to match on the
|
|
// line start, but subsequent lines could end on the first char
|
|
allowFirst := false
|
|
for !containsEnder(line, patternStart, allowFirst) {
|
|
if !scanner.Scan() {
|
|
// an error! expected more pattern, but got eof
|
|
t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern)
|
|
}
|
|
line = scanner.Text()
|
|
pattern += fmt.Sprintf("\n%s", line)
|
|
allowFirst = true
|
|
}
|
|
|
|
// we have our raw pattern! -- we need to convert this to a compiled regex
|
|
re := compileRawPattern(t, pattern)
|
|
|
|
var (
|
|
capsIdx map[int]int
|
|
m *Match
|
|
toMatch string
|
|
)
|
|
// now we need to parse the test cases if there are any
|
|
// they start with 4 spaces -- if we don't get a 4-space start then
|
|
// we're back out to our next pattern
|
|
for scanner.Scan() {
|
|
line = scanner.Text()
|
|
|
|
// blank line is our separator for a new pattern
|
|
if strings.TrimSpace(line) == "" {
|
|
break
|
|
}
|
|
|
|
// could be either " " or "\= Expect"
|
|
if strings.HasPrefix(line, "\\= Expect") {
|
|
continue
|
|
} else if strings.HasPrefix(line, " ") {
|
|
// trim off leading spaces for our text to match
|
|
toMatch = line[4:]
|
|
// trim off trailing spaces too
|
|
toMatch = strings.TrimRight(toMatch, " ")
|
|
|
|
m = matchString(t, re, toMatch)
|
|
|
|
capsIdx = make(map[int]int)
|
|
continue
|
|
//t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line)
|
|
} else if strings.HasPrefix(line, "No match") {
|
|
validateNoMatch(t, re, m)
|
|
// no match means we're done
|
|
continue
|
|
} else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 {
|
|
gIdx, _ := strconv.Atoi(subs[1])
|
|
if _, ok := capsIdx[gIdx]; !ok {
|
|
capsIdx[gIdx] = 0
|
|
}
|
|
validateMatch(t, re, m, toMatch, subs[2], gIdx, capsIdx[gIdx])
|
|
capsIdx[gIdx]++
|
|
continue
|
|
} else {
|
|
// no match -- problem
|
|
t.Fatalf("Unknown file format, expected match or match group but got '%v'", line)
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if err := scanner.Err(); err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
}
|
|
|
|
// matchGroup recognizes an expected-result line of the test file: optional
// leading whitespace, a decimal group number, ": ", and then the expected
// text for that group (captured as submatches 1 and 2).
var matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`)
|
|
|
|
func problem(t *testing.T, input string, args ...interface{}) {
|
|
failCount++
|
|
t.Errorf(input, args...)
|
|
}
|
|
|
|
func validateNoMatch(t *testing.T, re *Regexp, m *Match) {
|
|
if re == nil || m == nil {
|
|
return
|
|
}
|
|
|
|
problem(t, "Expected no match for pattern '%v', but got '%v'", re.pattern, m.String())
|
|
}
|
|
|
|
func validateMatch(t *testing.T, re *Regexp, m *Match, toMatch, value string, idx, capIdx int) {
|
|
if re == nil {
|
|
// already error'd earlier up stream
|
|
return
|
|
}
|
|
|
|
if m == nil {
|
|
// we didn't match, but should have
|
|
problem(t, "Expected match for pattern '%v' with input '%v', but got no match", re.pattern, toMatch)
|
|
return
|
|
}
|
|
|
|
g := m.Groups()
|
|
if len(g) <= idx {
|
|
problem(t, "Expected group %v does not exist in pattern '%v' with input '%v'", idx, re.pattern, toMatch)
|
|
return
|
|
}
|
|
|
|
if value == "<unset>" {
|
|
// this means we shouldn't have a cap for this group
|
|
if len(g[idx].Captures) > 0 {
|
|
problem(t, "Expected no cap %v in group %v in pattern '%v' with input '%v'", g[idx].Captures[capIdx].String(), idx, re.pattern, toMatch)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
if len(g[idx].Captures) <= capIdx {
|
|
problem(t, "Expected cap %v does not exist in group %v in pattern '%v' with input '%v'", capIdx, idx, re.pattern, toMatch)
|
|
return
|
|
}
|
|
|
|
escp := unEscapeGroup(g[idx].String())
|
|
//escp := unEscapeGroup(g[idx].Captures[capIdx].String())
|
|
if escp != value {
|
|
problem(t, "Expected '%v' but got '%v' for cap %v, group %v for pattern '%v' with input '%v'", value, escp, capIdx, idx, re.pattern, toMatch)
|
|
return
|
|
}
|
|
}
|
|
|
|
func compileRawPattern(t *testing.T, pattern string) *Regexp {
|
|
// check our end for RegexOptions -trim them off
|
|
index := strings.LastIndexAny(pattern, "/\"")
|
|
//
|
|
// Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite
|
|
//
|
|
var opts RegexOptions
|
|
|
|
if index+1 < len(pattern) {
|
|
textOptions := pattern[index+1:]
|
|
pattern = pattern[:index+1]
|
|
// there are lots of complex options here
|
|
for _, textOpt := range strings.Split(textOptions, ",") {
|
|
switch textOpt {
|
|
case "dupnames":
|
|
// we don't know how to handle this...
|
|
default:
|
|
if strings.Contains(textOpt, "i") {
|
|
opts |= IgnoreCase
|
|
}
|
|
if strings.Contains(textOpt, "s") {
|
|
opts |= Singleline
|
|
}
|
|
if strings.Contains(textOpt, "m") {
|
|
opts |= Multiline
|
|
}
|
|
if strings.Contains(textOpt, "x") {
|
|
opts |= IgnorePatternWhitespace
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// trim off first and last char
|
|
pattern = pattern[1 : len(pattern)-1]
|
|
|
|
defer func() {
|
|
if rec := recover(); rec != nil {
|
|
problem(t, "PANIC in compiling \"%v\": %v", pattern, rec)
|
|
}
|
|
}()
|
|
re, err := Compile(pattern, opts)
|
|
if err != nil {
|
|
problem(t, "Error parsing \"%v\": %v", pattern, err)
|
|
}
|
|
return re
|
|
}
|
|
|
|
func matchString(t *testing.T, re *Regexp, toMatch string) *Match {
|
|
if re == nil {
|
|
return nil
|
|
}
|
|
|
|
re.MatchTimeout = time.Second * 1
|
|
|
|
escp := ""
|
|
var err error
|
|
if toMatch != "\\" {
|
|
escp = unEscapeToMatch(toMatch)
|
|
}
|
|
m, err := re.FindStringMatch(escp)
|
|
if err != nil {
|
|
problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, re.pattern, err)
|
|
}
|
|
return m
|
|
}
|
|
|
|
// containsEnder reports whether line contains the closing delimiter ender.
// An occurrence at position 0 only counts when allowFirst is true; any later
// occurrence always counts.
func containsEnder(line string, ender byte, allowFirst bool) bool {
	switch last := strings.LastIndexByte(line, ender); {
	case last < 0:
		// delimiter not present at all
		return false
	case last == 0:
		// only at the very start of the line
		return allowFirst
	default:
		return true
	}
}
|
|
|
|
func unEscapeToMatch(line string) string {
|
|
idx := strings.IndexRune(line, '\\')
|
|
// no slashes means no unescape needed
|
|
if idx == -1 {
|
|
return line
|
|
}
|
|
|
|
buf := bytes.NewBufferString(line[:idx])
|
|
// get the runes for the rest of the string -- we're going full parser scan on this
|
|
|
|
inEscape := false
|
|
// take any \'s and convert them
|
|
for i := idx; i < len(line); i++ {
|
|
ch := line[i]
|
|
if ch == '\\' {
|
|
if inEscape {
|
|
buf.WriteByte(ch)
|
|
}
|
|
inEscape = !inEscape
|
|
continue
|
|
}
|
|
if inEscape {
|
|
switch ch {
|
|
case 'x':
|
|
buf.WriteByte(scanHex(line, &i))
|
|
case 'a':
|
|
buf.WriteByte(0x07)
|
|
case 'b':
|
|
buf.WriteByte('\b')
|
|
case 'e':
|
|
buf.WriteByte(0x1b)
|
|
case 'f':
|
|
buf.WriteByte('\f')
|
|
case 'n':
|
|
buf.WriteByte('\n')
|
|
case 'r':
|
|
buf.WriteByte('\r')
|
|
case 't':
|
|
buf.WriteByte('\t')
|
|
case 'v':
|
|
buf.WriteByte(0x0b)
|
|
default:
|
|
if ch >= '0' && ch <= '7' {
|
|
buf.WriteByte(scanOctal(line, &i))
|
|
} else {
|
|
buf.WriteByte(ch)
|
|
//panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line))
|
|
}
|
|
}
|
|
inEscape = false
|
|
} else {
|
|
buf.WriteByte(ch)
|
|
}
|
|
}
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// unEscapeGroup renders val the way expected results are written in the test
// file: bytes in 0x00-0x1f and 0x7f-0xff become \xNN with two lowercase hex
// digits, every other byte is copied unchanged. It works byte by byte, so a
// multi-byte UTF-8 character yields one \xNN per byte.
func unEscapeGroup(val string) string {
	var out bytes.Buffer

	for _, b := range []byte(val) {
		if b > 0x1f && b < 0x7f {
			// printable ASCII is written as-is
			out.WriteByte(b)
			continue
		}
		fmt.Fprintf(&out, "\\x%.2x", b)
	}

	return out.String()
}
|
|
|
|
func scanHex(line string, idx *int) byte {
|
|
if *idx >= len(line)-2 {
|
|
panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx))
|
|
}
|
|
(*idx)++
|
|
d1 := hexDigit(line[*idx])
|
|
(*idx)++
|
|
d2 := hexDigit(line[*idx])
|
|
if d1 < 0 || d2 < 0 {
|
|
panic("bad hex chars")
|
|
}
|
|
|
|
return byte(d1*0x10 + d2)
|
|
}
|
|
|
|
// hexDigit returns the value (0-15) of the hex digit ch, accepting both
// lowercase and uppercase letters, or -1 when ch is not a hex digit.
func hexDigit(ch byte) int {
	switch {
	case ch >= '0' && ch <= '9':
		return int(ch - '0')
	case ch >= 'a' && ch <= 'f':
		return int(ch-'a') + 0xa
	case ch >= 'A' && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
|
|
|
|
// scanOctal reads between one and three octal digits starting at position
// *idx in line and returns their value. On return *idx points at the last
// digit that was consumed.
//
// Octal codes only go up to 255: a larger three-digit value is not rejected,
// its high bits are truncated, which is the behavior Perl follows.
func scanOctal(line string, idx *int) byte {
	pos := *idx

	// never look at more than three bytes, nor past the end of the line
	limit := pos + 3
	if limit > len(line) {
		limit = len(line)
	}

	val := 0
	// byte arithmetic wraps, so any non-digit ends up greater than 7
	digit := line[pos] - '0'
	for pos < limit && digit <= 7 {
		val = val<<3 | int(digit)
		pos++
		if pos < len(line) {
			digit = line[pos] - '0'
		}
	}

	// step back onto the last digit read; the caller's loop advances past it
	*idx = pos - 1

	return byte(val & 0xFF)
}
|