pm.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. // Lua pattern match functions for Go
  2. package pm
  3. import (
  4. "fmt"
  5. )
  6. const EOS = -1
  7. const _UNKNOWN = -2
  8. /* Error {{{ */
  9. type Error struct {
  10. Pos int
  11. Message string
  12. }
  13. func newError(pos int, message string, args ...interface{}) *Error {
  14. if len(args) == 0 {
  15. return &Error{pos, message}
  16. }
  17. return &Error{pos, fmt.Sprintf(message, args...)}
  18. }
  19. func (e *Error) Error() string {
  20. switch e.Pos {
  21. case EOS:
  22. return fmt.Sprintf("%s at EOS", e.Message)
  23. case _UNKNOWN:
  24. return fmt.Sprintf("%s", e.Message)
  25. default:
  26. return fmt.Sprintf("%s at %d", e.Message, e.Pos)
  27. }
  28. }
  29. /* }}} */
  30. /* MatchData {{{ */
  31. type MatchData struct {
  32. // captured positions
  33. // layout
  34. // xxxx xxxx xxxx xxx0 : caputured positions
  35. // xxxx xxxx xxxx xxx1 : position captured positions
  36. captures []uint32
  37. }
  38. func newMatchState() *MatchData { return &MatchData{[]uint32{}} }
  39. func (st *MatchData) addPosCapture(s, pos int) {
  40. for s+1 >= len(st.captures) {
  41. st.captures = append(st.captures, 0)
  42. }
  43. st.captures[s] = (uint32(pos) << 1) | 1
  44. st.captures[s+1] = (uint32(pos) << 1) | 1
  45. }
  46. func (st *MatchData) setCapture(s, pos int) uint32 {
  47. for s >= len(st.captures) {
  48. st.captures = append(st.captures, 0)
  49. }
  50. v := st.captures[s]
  51. st.captures[s] = (uint32(pos) << 1)
  52. return v
  53. }
  54. func (st *MatchData) restoreCapture(s int, pos uint32) { st.captures[s] = pos }
  55. func (st *MatchData) CaptureLength() int { return len(st.captures) }
  56. func (st *MatchData) IsPosCapture(idx int) bool { return (st.captures[idx] & 1) == 1 }
  57. func (st *MatchData) Capture(idx int) int { return int(st.captures[idx] >> 1) }
  58. /* }}} */
  59. /* scanner {{{ */
  60. type scannerState struct {
  61. Pos int
  62. started bool
  63. }
  64. type scanner struct {
  65. src []byte
  66. State scannerState
  67. saved scannerState
  68. }
  69. func newScanner(src []byte) *scanner {
  70. return &scanner{
  71. src: src,
  72. State: scannerState{
  73. Pos: 0,
  74. started: false,
  75. },
  76. saved: scannerState{},
  77. }
  78. }
  79. func (sc *scanner) Length() int { return len(sc.src) }
  80. func (sc *scanner) Next() int {
  81. if !sc.State.started {
  82. sc.State.started = true
  83. if len(sc.src) == 0 {
  84. sc.State.Pos = EOS
  85. }
  86. } else {
  87. sc.State.Pos = sc.NextPos()
  88. }
  89. if sc.State.Pos == EOS {
  90. return EOS
  91. }
  92. return int(sc.src[sc.State.Pos])
  93. }
  94. func (sc *scanner) CurrentPos() int {
  95. return sc.State.Pos
  96. }
  97. func (sc *scanner) NextPos() int {
  98. if sc.State.Pos == EOS || sc.State.Pos >= len(sc.src)-1 {
  99. return EOS
  100. }
  101. if !sc.State.started {
  102. return 0
  103. } else {
  104. return sc.State.Pos + 1
  105. }
  106. }
  107. func (sc *scanner) Peek() int {
  108. cureof := sc.State.Pos == EOS
  109. ch := sc.Next()
  110. if !cureof {
  111. if sc.State.Pos == EOS {
  112. sc.State.Pos = len(sc.src) - 1
  113. } else {
  114. sc.State.Pos--
  115. if sc.State.Pos < 0 {
  116. sc.State.Pos = 0
  117. sc.State.started = false
  118. }
  119. }
  120. }
  121. return ch
  122. }
  123. func (sc *scanner) Save() { sc.saved = sc.State }
  124. func (sc *scanner) Restore() { sc.State = sc.saved }
  125. /* }}} */
  126. /* bytecode {{{ */
  127. type opCode int
  128. const (
  129. opChar opCode = iota
  130. opMatch
  131. opTailMatch
  132. opJmp
  133. opSplit
  134. opSave
  135. opPSave
  136. opBrace
  137. opNumber
  138. )
  139. type inst struct {
  140. OpCode opCode
  141. Class class
  142. Operand1 int
  143. Operand2 int
  144. }
  145. /* }}} */
  146. /* classes {{{ */
  147. type class interface {
  148. Matches(ch int) bool
  149. }
  150. type dotClass struct{}
  151. func (pn *dotClass) Matches(ch int) bool { return true }
  152. type charClass struct {
  153. Ch int
  154. }
  155. func (pn *charClass) Matches(ch int) bool { return pn.Ch == ch }
  156. type singleClass struct {
  157. Class int
  158. }
  159. func (pn *singleClass) Matches(ch int) bool {
  160. ret := false
  161. switch pn.Class {
  162. case 'a', 'A':
  163. ret = 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z'
  164. case 'c', 'C':
  165. ret = (0x00 <= ch && ch <= 0x1F) || ch == 0x7F
  166. case 'd', 'D':
  167. ret = '0' <= ch && ch <= '9'
  168. case 'l', 'L':
  169. ret = 'a' <= ch && ch <= 'z'
  170. case 'p', 'P':
  171. ret = (0x21 <= ch && ch <= 0x2f) || (0x3a <= ch && ch <= 0x40) || (0x5b <= ch && ch <= 0x60) || (0x7b <= ch && ch <= 0x7e)
  172. case 's', 'S':
  173. switch ch {
  174. case ' ', '\f', '\n', '\r', '\t', '\v':
  175. ret = true
  176. }
  177. case 'u', 'U':
  178. ret = 'A' <= ch && ch <= 'Z'
  179. case 'w', 'W':
  180. ret = '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z'
  181. case 'x', 'X':
  182. ret = '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
  183. case 'z', 'Z':
  184. ret = ch == 0
  185. default:
  186. return ch == pn.Class
  187. }
  188. if 'A' <= pn.Class && pn.Class <= 'Z' {
  189. return !ret
  190. }
  191. return ret
  192. }
  193. type setClass struct {
  194. IsNot bool
  195. Classes []class
  196. }
  197. func (pn *setClass) Matches(ch int) bool {
  198. for _, class := range pn.Classes {
  199. if class.Matches(ch) {
  200. return !pn.IsNot
  201. }
  202. }
  203. return pn.IsNot
  204. }
  205. type rangeClass struct {
  206. Begin class
  207. End class
  208. }
  209. func (pn *rangeClass) Matches(ch int) bool {
  210. switch begin := pn.Begin.(type) {
  211. case *charClass:
  212. end, ok := pn.End.(*charClass)
  213. if !ok {
  214. return false
  215. }
  216. return begin.Ch <= ch && ch <= end.Ch
  217. }
  218. return false
  219. }
  220. // }}}
  221. // patterns {{{
  222. type pattern interface{}
  223. type singlePattern struct {
  224. Class class
  225. }
  226. type seqPattern struct {
  227. MustHead bool
  228. MustTail bool
  229. Patterns []pattern
  230. }
  231. type repeatPattern struct {
  232. Type int
  233. Class class
  234. }
  235. type posCapPattern struct{}
  236. type capPattern struct {
  237. Pattern pattern
  238. }
  239. type numberPattern struct {
  240. N int
  241. }
  242. type bracePattern struct {
  243. Begin int
  244. End int
  245. }
  246. // }}}
  247. /* parse {{{ */
  248. func parseClass(sc *scanner, allowset bool) class {
  249. ch := sc.Next()
  250. switch ch {
  251. case '%':
  252. return &singleClass{sc.Next()}
  253. case '.':
  254. if allowset {
  255. return &dotClass{}
  256. }
  257. return &charClass{ch}
  258. case '[':
  259. if allowset {
  260. return parseClassSet(sc)
  261. }
  262. return &charClass{ch}
  263. //case '^' '$', '(', ')', ']', '*', '+', '-', '?':
  264. // panic(newError(sc.CurrentPos(), "invalid %c", ch))
  265. case EOS:
  266. panic(newError(sc.CurrentPos(), "unexpected EOS"))
  267. default:
  268. return &charClass{ch}
  269. }
  270. }
  271. func parseClassSet(sc *scanner) class {
  272. set := &setClass{false, []class{}}
  273. if sc.Peek() == '^' {
  274. set.IsNot = true
  275. sc.Next()
  276. }
  277. isrange := false
  278. for {
  279. ch := sc.Peek()
  280. switch ch {
  281. // case '[':
  282. // panic(newError(sc.CurrentPos(), "'[' can not be nested"))
  283. case EOS:
  284. panic(newError(sc.CurrentPos(), "unexpected EOS"))
  285. case ']':
  286. if len(set.Classes) > 0 {
  287. sc.Next()
  288. goto exit
  289. }
  290. fallthrough
  291. case '-':
  292. if len(set.Classes) > 0 {
  293. sc.Next()
  294. isrange = true
  295. continue
  296. }
  297. fallthrough
  298. default:
  299. set.Classes = append(set.Classes, parseClass(sc, false))
  300. }
  301. if isrange {
  302. begin := set.Classes[len(set.Classes)-2]
  303. end := set.Classes[len(set.Classes)-1]
  304. set.Classes = set.Classes[0 : len(set.Classes)-2]
  305. set.Classes = append(set.Classes, &rangeClass{begin, end})
  306. isrange = false
  307. }
  308. }
  309. exit:
  310. if isrange {
  311. set.Classes = append(set.Classes, &charClass{'-'})
  312. }
  313. return set
  314. }
  315. func parsePattern(sc *scanner, toplevel bool) *seqPattern {
  316. pat := &seqPattern{}
  317. if toplevel {
  318. if sc.Peek() == '^' {
  319. sc.Next()
  320. pat.MustHead = true
  321. }
  322. }
  323. for {
  324. ch := sc.Peek()
  325. switch ch {
  326. case '%':
  327. sc.Save()
  328. sc.Next()
  329. switch sc.Peek() {
  330. case '0':
  331. panic(newError(sc.CurrentPos(), "invalid capture index"))
  332. case '1', '2', '3', '4', '5', '6', '7', '8', '9':
  333. pat.Patterns = append(pat.Patterns, &numberPattern{sc.Next() - 48})
  334. case 'b':
  335. sc.Next()
  336. pat.Patterns = append(pat.Patterns, &bracePattern{sc.Next(), sc.Next()})
  337. default:
  338. sc.Restore()
  339. pat.Patterns = append(pat.Patterns, &singlePattern{parseClass(sc, true)})
  340. }
  341. case '.', '[', ']':
  342. pat.Patterns = append(pat.Patterns, &singlePattern{parseClass(sc, true)})
  343. //case ']':
  344. // panic(newError(sc.CurrentPos(), "invalid ']'"))
  345. case ')':
  346. if toplevel {
  347. panic(newError(sc.CurrentPos(), "invalid ')'"))
  348. }
  349. return pat
  350. case '(':
  351. sc.Next()
  352. if sc.Peek() == ')' {
  353. sc.Next()
  354. pat.Patterns = append(pat.Patterns, &posCapPattern{})
  355. } else {
  356. ret := &capPattern{parsePattern(sc, false)}
  357. if sc.Peek() != ')' {
  358. panic(newError(sc.CurrentPos(), "unfinished capture"))
  359. }
  360. sc.Next()
  361. pat.Patterns = append(pat.Patterns, ret)
  362. }
  363. case '*', '+', '-', '?':
  364. sc.Next()
  365. if len(pat.Patterns) > 0 {
  366. spat, ok := pat.Patterns[len(pat.Patterns)-1].(*singlePattern)
  367. if ok {
  368. pat.Patterns = pat.Patterns[0 : len(pat.Patterns)-1]
  369. pat.Patterns = append(pat.Patterns, &repeatPattern{ch, spat.Class})
  370. continue
  371. }
  372. }
  373. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  374. case '$':
  375. if toplevel && (sc.NextPos() == sc.Length()-1 || sc.NextPos() == EOS) {
  376. pat.MustTail = true
  377. } else {
  378. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  379. }
  380. sc.Next()
  381. case EOS:
  382. sc.Next()
  383. goto exit
  384. default:
  385. sc.Next()
  386. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  387. }
  388. }
  389. exit:
  390. return pat
  391. }
  392. type iptr struct {
  393. insts []inst
  394. capture int
  395. }
  396. func compilePattern(p pattern, ps ...*iptr) []inst {
  397. var ptr *iptr
  398. toplevel := false
  399. if len(ps) == 0 {
  400. toplevel = true
  401. ptr = &iptr{[]inst{inst{opSave, nil, 0, -1}}, 2}
  402. } else {
  403. ptr = ps[0]
  404. }
  405. switch pat := p.(type) {
  406. case *singlePattern:
  407. ptr.insts = append(ptr.insts, inst{opChar, pat.Class, -1, -1})
  408. case *seqPattern:
  409. for _, cp := range pat.Patterns {
  410. compilePattern(cp, ptr)
  411. }
  412. case *repeatPattern:
  413. idx := len(ptr.insts)
  414. switch pat.Type {
  415. case '*':
  416. ptr.insts = append(ptr.insts,
  417. inst{opSplit, nil, idx + 1, idx + 3},
  418. inst{opChar, pat.Class, -1, -1},
  419. inst{opJmp, nil, idx, -1})
  420. case '+':
  421. ptr.insts = append(ptr.insts,
  422. inst{opChar, pat.Class, -1, -1},
  423. inst{opSplit, nil, idx, idx + 2})
  424. case '-':
  425. ptr.insts = append(ptr.insts,
  426. inst{opSplit, nil, idx + 3, idx + 1},
  427. inst{opChar, pat.Class, -1, -1},
  428. inst{opJmp, nil, idx, -1})
  429. case '?':
  430. ptr.insts = append(ptr.insts,
  431. inst{opSplit, nil, idx + 1, idx + 2},
  432. inst{opChar, pat.Class, -1, -1})
  433. }
  434. case *posCapPattern:
  435. ptr.insts = append(ptr.insts, inst{opPSave, nil, ptr.capture, -1})
  436. ptr.capture += 2
  437. case *capPattern:
  438. c0, c1 := ptr.capture, ptr.capture+1
  439. ptr.capture += 2
  440. ptr.insts = append(ptr.insts, inst{opSave, nil, c0, -1})
  441. compilePattern(pat.Pattern, ptr)
  442. ptr.insts = append(ptr.insts, inst{opSave, nil, c1, -1})
  443. case *bracePattern:
  444. ptr.insts = append(ptr.insts, inst{opBrace, nil, pat.Begin, pat.End})
  445. case *numberPattern:
  446. ptr.insts = append(ptr.insts, inst{opNumber, nil, pat.N, -1})
  447. }
  448. if toplevel {
  449. if p.(*seqPattern).MustTail {
  450. ptr.insts = append(ptr.insts, inst{opSave, nil, 1, -1}, inst{opTailMatch, nil, -1, -1})
  451. }
  452. ptr.insts = append(ptr.insts, inst{opSave, nil, 1, -1}, inst{opMatch, nil, -1, -1})
  453. }
  454. return ptr.insts
  455. }
  456. /* }}} parse */
  457. /* VM {{{ */
  458. // Simple recursive virtual machine based on the
  459. // "Regular Expression Matching: the Virtual Machine Approach" (https://swtch.com/~rsc/regexp/regexp2.html)
  460. func recursiveVM(src []byte, insts []inst, pc, sp int, ms ...*MatchData) (bool, int, *MatchData) {
  461. var m *MatchData
  462. if len(ms) == 0 {
  463. m = newMatchState()
  464. } else {
  465. m = ms[0]
  466. }
  467. redo:
  468. inst := insts[pc]
  469. switch inst.OpCode {
  470. case opChar:
  471. if sp >= len(src) || !inst.Class.Matches(int(src[sp])) {
  472. return false, sp, m
  473. }
  474. pc++
  475. sp++
  476. goto redo
  477. case opMatch:
  478. return true, sp, m
  479. case opTailMatch:
  480. return sp >= len(src), sp, m
  481. case opJmp:
  482. pc = inst.Operand1
  483. goto redo
  484. case opSplit:
  485. if ok, nsp, _ := recursiveVM(src, insts, inst.Operand1, sp, m); ok {
  486. return true, nsp, m
  487. }
  488. pc = inst.Operand2
  489. goto redo
  490. case opSave:
  491. s := m.setCapture(inst.Operand1, sp)
  492. if ok, nsp, _ := recursiveVM(src, insts, pc+1, sp, m); ok {
  493. return true, nsp, m
  494. }
  495. m.restoreCapture(inst.Operand1, s)
  496. return false, sp, m
  497. case opPSave:
  498. m.addPosCapture(inst.Operand1, sp+1)
  499. pc++
  500. goto redo
  501. case opBrace:
  502. if sp >= len(src) || int(src[sp]) != inst.Operand1 {
  503. return false, sp, m
  504. }
  505. count := 1
  506. for sp = sp + 1; sp < len(src); sp++ {
  507. if int(src[sp]) == inst.Operand2 {
  508. count--
  509. }
  510. if count == 0 {
  511. pc++
  512. sp++
  513. goto redo
  514. }
  515. if int(src[sp]) == inst.Operand1 {
  516. count++
  517. }
  518. }
  519. return false, sp, m
  520. case opNumber:
  521. idx := inst.Operand1 * 2
  522. if idx >= m.CaptureLength()-1 {
  523. panic(newError(_UNKNOWN, "invalid capture index"))
  524. }
  525. capture := src[m.Capture(idx):m.Capture(idx+1)]
  526. for i := 0; i < len(capture); i++ {
  527. if i+sp >= len(src) || capture[i] != src[i+sp] {
  528. return false, sp, m
  529. }
  530. }
  531. pc++
  532. sp += len(capture)
  533. goto redo
  534. }
  535. panic("should not reach here")
  536. }
  537. /* }}} */
  538. /* API {{{ */
  539. func Find(p string, src []byte, offset, limit int) (matches []*MatchData, err error) {
  540. defer func() {
  541. if v := recover(); v != nil {
  542. if perr, ok := v.(*Error); ok {
  543. err = perr
  544. } else {
  545. panic(v)
  546. }
  547. }
  548. }()
  549. pat := parsePattern(newScanner([]byte(p)), true)
  550. insts := compilePattern(pat)
  551. matches = []*MatchData{}
  552. for sp := offset; sp <= len(src); {
  553. ok, nsp, ms := recursiveVM(src, insts, 0, sp)
  554. sp++
  555. if ok {
  556. if sp < nsp {
  557. sp = nsp
  558. }
  559. matches = append(matches, ms)
  560. }
  561. if len(matches) == limit || pat.MustHead {
  562. break
  563. }
  564. }
  565. return
  566. }
  567. /* }}} */