pm.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. // Lua pattern match functions for Go
  2. package pm
  3. import (
  4. "fmt"
  5. )
  6. const EOS = -1
  7. const _UNKNOWN = -2
  8. /* Error {{{ */
  9. type Error struct {
  10. Pos int
  11. Message string
  12. }
  13. func newError(pos int, message string, args ...interface{}) *Error {
  14. if len(args) == 0 {
  15. return &Error{pos, message}
  16. }
  17. return &Error{pos, fmt.Sprintf(message, args...)}
  18. }
  19. func (e *Error) Error() string {
  20. switch e.Pos {
  21. case EOS:
  22. return fmt.Sprintf("%s at EOS", e.Message)
  23. case _UNKNOWN:
  24. return fmt.Sprintf("%s", e.Message)
  25. default:
  26. return fmt.Sprintf("%s at %d", e.Message, e.Pos)
  27. }
  28. }
  29. /* }}} */
  30. /* MatchData {{{ */
  31. type MatchData struct {
  32. // captured positions
  33. // layout
  34. // xxxx xxxx xxxx xxx0 : caputured positions
  35. // xxxx xxxx xxxx xxx1 : position captured positions
  36. captures []uint32
  37. }
  38. func newMatchState() *MatchData { return &MatchData{[]uint32{}} }
  39. func (st *MatchData) addPosCapture(s, pos int) {
  40. for s+1 >= len(st.captures) {
  41. st.captures = append(st.captures, 0)
  42. }
  43. st.captures[s] = (uint32(pos) << 1) | 1
  44. st.captures[s+1] = (uint32(pos) << 1) | 1
  45. }
  46. func (st *MatchData) setCapture(s, pos int) uint32 {
  47. for s >= len(st.captures) {
  48. st.captures = append(st.captures, 0)
  49. }
  50. v := st.captures[s]
  51. st.captures[s] = (uint32(pos) << 1)
  52. return v
  53. }
  54. func (st *MatchData) restoreCapture(s int, pos uint32) { st.captures[s] = pos }
  55. func (st *MatchData) CaptureLength() int { return len(st.captures) }
  56. func (st *MatchData) IsPosCapture(idx int) bool { return (st.captures[idx] & 1) == 1 }
  57. func (st *MatchData) Capture(idx int) int { return int(st.captures[idx] >> 1) }
  58. /* }}} */
  59. /* scanner {{{ */
  60. type scannerState struct {
  61. Pos int
  62. started bool
  63. }
  64. type scanner struct {
  65. src []byte
  66. State scannerState
  67. saved scannerState
  68. }
  69. func newScanner(src []byte) *scanner {
  70. return &scanner{
  71. src: src,
  72. State: scannerState{
  73. Pos: 0,
  74. started: false,
  75. },
  76. saved: scannerState{},
  77. }
  78. }
  79. func (sc *scanner) Length() int { return len(sc.src) }
  80. func (sc *scanner) Next() int {
  81. if !sc.State.started {
  82. sc.State.started = true
  83. if len(sc.src) == 0 {
  84. sc.State.Pos = EOS
  85. }
  86. } else {
  87. sc.State.Pos = sc.NextPos()
  88. }
  89. if sc.State.Pos == EOS {
  90. return EOS
  91. }
  92. return int(sc.src[sc.State.Pos])
  93. }
  94. func (sc *scanner) CurrentPos() int {
  95. return sc.State.Pos
  96. }
  97. func (sc *scanner) NextPos() int {
  98. if sc.State.Pos == EOS || sc.State.Pos >= len(sc.src)-1 {
  99. return EOS
  100. }
  101. if !sc.State.started {
  102. return 0
  103. } else {
  104. return sc.State.Pos + 1
  105. }
  106. }
  107. func (sc *scanner) Peek() int {
  108. cureof := sc.State.Pos == EOS
  109. ch := sc.Next()
  110. if !cureof {
  111. if sc.State.Pos == EOS {
  112. sc.State.Pos = len(sc.src) - 1
  113. } else {
  114. sc.State.Pos--
  115. if sc.State.Pos < 0 {
  116. sc.State.Pos = 0
  117. sc.State.started = false
  118. }
  119. }
  120. }
  121. return ch
  122. }
  123. func (sc *scanner) Save() { sc.saved = sc.State }
  124. func (sc *scanner) Restore() { sc.State = sc.saved }
  125. /* }}} */
  126. /* bytecode {{{ */
  127. type opCode int
  128. const (
  129. opChar opCode = iota
  130. opMatch
  131. opTailMatch
  132. opJmp
  133. opSplit
  134. opSave
  135. opPSave
  136. opBrace
  137. opNumber
  138. )
  139. type inst struct {
  140. OpCode opCode
  141. Class class
  142. Operand1 int
  143. Operand2 int
  144. }
  145. /* }}} */
  146. /* classes {{{ */
  147. type class interface {
  148. Matches(ch int) bool
  149. }
  150. type dotClass struct{}
  151. func (pn *dotClass) Matches(ch int) bool { return true }
  152. type charClass struct {
  153. Ch int
  154. }
  155. func (pn *charClass) Matches(ch int) bool { return pn.Ch == ch }
  156. type singleClass struct {
  157. Class int
  158. }
  159. func (pn *singleClass) Matches(ch int) bool {
  160. ret := false
  161. switch pn.Class {
  162. case 'a', 'A':
  163. ret = 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z'
  164. case 'c', 'C':
  165. ret = (0x00 <= ch && ch <= 0x1F) || ch == 0x7F
  166. case 'd', 'D':
  167. ret = '0' <= ch && ch <= '9'
  168. case 'l', 'L':
  169. ret = 'a' <= ch && ch <= 'z'
  170. case 'p', 'P':
  171. ret = (0x21 <= ch && ch <= 0x2f) || (0x30 <= ch && ch <= 0x40) || (0x5b <= ch && ch <= 0x60) || (0x7b <= ch && ch <= 0x7e)
  172. case 's', 'S':
  173. switch ch {
  174. case ' ', '\f', '\n', '\r', '\t', '\v':
  175. ret = true
  176. }
  177. case 'u', 'U':
  178. ret = 'A' <= ch && ch <= 'Z'
  179. case 'w', 'W':
  180. ret = '0' <= ch && ch <= '9' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z'
  181. case 'x', 'X':
  182. ret = '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
  183. case 'z', 'Z':
  184. ret = ch == 0
  185. default:
  186. return ch == pn.Class
  187. }
  188. if 'A' <= pn.Class && pn.Class <= 'Z' {
  189. return !ret
  190. }
  191. return ret
  192. }
  193. type setClass struct {
  194. IsNot bool
  195. Classes []class
  196. }
  197. func (pn *setClass) Matches(ch int) bool {
  198. for _, class := range pn.Classes {
  199. if class.Matches(ch) {
  200. return !pn.IsNot
  201. }
  202. }
  203. return pn.IsNot
  204. }
  205. type rangeClass struct {
  206. Begin class
  207. End class
  208. }
  209. func (pn *rangeClass) Matches(ch int) bool {
  210. switch begin := pn.Begin.(type) {
  211. case *charClass:
  212. end, ok := pn.End.(*charClass)
  213. if !ok {
  214. return false
  215. }
  216. return begin.Ch <= ch && ch <= end.Ch
  217. }
  218. return false
  219. }
  220. // }}}
  221. // patterns {{{
  222. type pattern interface{}
  223. type singlePattern struct {
  224. Class class
  225. }
  226. type seqPattern struct {
  227. MustHead bool
  228. MustTail bool
  229. Patterns []pattern
  230. }
  231. type repeatPattern struct {
  232. Type int
  233. Class class
  234. }
  235. type posCapPattern struct{}
  236. type capPattern struct {
  237. Pattern pattern
  238. }
  239. type numberPattern struct {
  240. N int
  241. }
  242. type bracePattern struct {
  243. Begin int
  244. End int
  245. }
  246. // }}}
  247. /* parse {{{ */
  248. func parseClass(sc *scanner, allowset bool) class {
  249. ch := sc.Next()
  250. switch ch {
  251. case '%':
  252. return &singleClass{sc.Next()}
  253. case '.':
  254. if allowset {
  255. return &dotClass{}
  256. } else {
  257. return &charClass{ch}
  258. }
  259. case '[':
  260. if !allowset {
  261. panic(newError(sc.CurrentPos(), "invalid '['"))
  262. }
  263. return parseClassSet(sc)
  264. //case '^' '$', '(', ')', ']', '*', '+', '-', '?':
  265. // panic(newError(sc.CurrentPos(), "invalid %c", ch))
  266. case EOS:
  267. panic(newError(sc.CurrentPos(), "unexpected EOS"))
  268. default:
  269. return &charClass{ch}
  270. }
  271. }
  272. func parseClassSet(sc *scanner) class {
  273. set := &setClass{false, []class{}}
  274. if sc.Peek() == '^' {
  275. set.IsNot = true
  276. sc.Next()
  277. }
  278. isrange := false
  279. for {
  280. ch := sc.Peek()
  281. switch ch {
  282. case '[':
  283. panic(newError(sc.CurrentPos(), "'[' can not be nested"))
  284. case ']':
  285. sc.Next()
  286. goto exit
  287. case EOS:
  288. panic(newError(sc.CurrentPos(), "unexpected EOS"))
  289. case '-':
  290. if len(set.Classes) > 0 {
  291. sc.Next()
  292. isrange = true
  293. continue
  294. }
  295. fallthrough
  296. default:
  297. set.Classes = append(set.Classes, parseClass(sc, false))
  298. }
  299. if isrange {
  300. begin := set.Classes[len(set.Classes)-2]
  301. end := set.Classes[len(set.Classes)-1]
  302. set.Classes = set.Classes[0 : len(set.Classes)-2]
  303. set.Classes = append(set.Classes, &rangeClass{begin, end})
  304. isrange = false
  305. }
  306. }
  307. exit:
  308. if isrange {
  309. set.Classes = append(set.Classes, &charClass{'-'})
  310. }
  311. return set
  312. }
  313. func parsePattern(sc *scanner, toplevel bool) *seqPattern {
  314. pat := &seqPattern{}
  315. if toplevel {
  316. if sc.Peek() == '^' {
  317. sc.Next()
  318. pat.MustHead = true
  319. }
  320. }
  321. for {
  322. ch := sc.Peek()
  323. switch ch {
  324. case '%':
  325. sc.Save()
  326. sc.Next()
  327. switch sc.Peek() {
  328. case '0':
  329. panic(newError(sc.CurrentPos(), "invalid capture index"))
  330. case '1', '2', '3', '4', '5', '6', '7', '8', '9':
  331. pat.Patterns = append(pat.Patterns, &numberPattern{sc.Next() - 48})
  332. case 'b':
  333. sc.Next()
  334. pat.Patterns = append(pat.Patterns, &bracePattern{sc.Next(), sc.Next()})
  335. default:
  336. sc.Restore()
  337. pat.Patterns = append(pat.Patterns, &singlePattern{parseClass(sc, true)})
  338. }
  339. case '.', '[':
  340. pat.Patterns = append(pat.Patterns, &singlePattern{parseClass(sc, true)})
  341. case ']':
  342. panic(newError(sc.CurrentPos(), "invalid ']'"))
  343. case ')':
  344. if toplevel {
  345. panic(newError(sc.CurrentPos(), "invalid ')'"))
  346. }
  347. return pat
  348. case '(':
  349. sc.Next()
  350. if sc.Peek() == ')' {
  351. sc.Next()
  352. pat.Patterns = append(pat.Patterns, &posCapPattern{})
  353. } else {
  354. ret := &capPattern{parsePattern(sc, false)}
  355. if sc.Peek() != ')' {
  356. panic(newError(sc.CurrentPos(), "unfinished capture"))
  357. }
  358. sc.Next()
  359. pat.Patterns = append(pat.Patterns, ret)
  360. }
  361. case '*', '+', '-', '?':
  362. sc.Next()
  363. if len(pat.Patterns) > 0 {
  364. spat, ok := pat.Patterns[len(pat.Patterns)-1].(*singlePattern)
  365. if ok {
  366. pat.Patterns = pat.Patterns[0 : len(pat.Patterns)-1]
  367. pat.Patterns = append(pat.Patterns, &repeatPattern{ch, spat.Class})
  368. continue
  369. }
  370. }
  371. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  372. case '$':
  373. if toplevel && (sc.NextPos() == sc.Length()-1 || sc.NextPos() == EOS) {
  374. pat.MustTail = true
  375. } else {
  376. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  377. }
  378. sc.Next()
  379. case EOS:
  380. sc.Next()
  381. goto exit
  382. default:
  383. sc.Next()
  384. pat.Patterns = append(pat.Patterns, &singlePattern{&charClass{ch}})
  385. }
  386. }
  387. exit:
  388. return pat
  389. }
  390. type iptr struct {
  391. insts []inst
  392. capture int
  393. }
  394. func compilePattern(p pattern, ps ...*iptr) []inst {
  395. var ptr *iptr
  396. toplevel := false
  397. if len(ps) == 0 {
  398. toplevel = true
  399. ptr = &iptr{[]inst{inst{opSave, nil, 0, -1}}, 2}
  400. } else {
  401. ptr = ps[0]
  402. }
  403. switch pat := p.(type) {
  404. case *singlePattern:
  405. ptr.insts = append(ptr.insts, inst{opChar, pat.Class, -1, -1})
  406. case *seqPattern:
  407. for _, cp := range pat.Patterns {
  408. compilePattern(cp, ptr)
  409. }
  410. case *repeatPattern:
  411. idx := len(ptr.insts)
  412. switch pat.Type {
  413. case '*':
  414. ptr.insts = append(ptr.insts,
  415. inst{opSplit, nil, idx + 1, idx + 3},
  416. inst{opChar, pat.Class, -1, -1},
  417. inst{opJmp, nil, idx, -1})
  418. case '+':
  419. ptr.insts = append(ptr.insts,
  420. inst{opChar, pat.Class, -1, -1},
  421. inst{opSplit, nil, idx, idx + 2})
  422. case '-':
  423. ptr.insts = append(ptr.insts,
  424. inst{opSplit, nil, idx + 3, idx + 1},
  425. inst{opChar, pat.Class, -1, -1},
  426. inst{opJmp, nil, idx, -1})
  427. case '?':
  428. ptr.insts = append(ptr.insts,
  429. inst{opSplit, nil, idx + 1, idx + 2},
  430. inst{opChar, pat.Class, -1, -1})
  431. }
  432. case *posCapPattern:
  433. ptr.insts = append(ptr.insts, inst{opPSave, nil, ptr.capture, -1})
  434. ptr.capture += 2
  435. case *capPattern:
  436. c0, c1 := ptr.capture, ptr.capture+1
  437. ptr.capture += 2
  438. ptr.insts = append(ptr.insts, inst{opSave, nil, c0, -1})
  439. compilePattern(pat.Pattern, ptr)
  440. ptr.insts = append(ptr.insts, inst{opSave, nil, c1, -1})
  441. case *bracePattern:
  442. ptr.insts = append(ptr.insts, inst{opBrace, nil, pat.Begin, pat.End})
  443. case *numberPattern:
  444. ptr.insts = append(ptr.insts, inst{opNumber, nil, pat.N, -1})
  445. }
  446. if toplevel {
  447. if p.(*seqPattern).MustTail {
  448. ptr.insts = append(ptr.insts, inst{opSave, nil, 1, -1}, inst{opTailMatch, nil, -1, -1})
  449. }
  450. ptr.insts = append(ptr.insts, inst{opSave, nil, 1, -1}, inst{opMatch, nil, -1, -1})
  451. }
  452. return ptr.insts
  453. }
  454. /* }}} parse */
  455. /* VM {{{ */
  456. // Simple recursive virtual machine based on the
  457. // "Regular Expression Matching: the Virtual Machine Approach" (https://swtch.com/~rsc/regexp/regexp2.html)
  458. func recursiveVM(src []byte, insts []inst, pc, sp int, ms ...*MatchData) (bool, int, *MatchData) {
  459. var m *MatchData
  460. if len(ms) == 0 {
  461. m = newMatchState()
  462. } else {
  463. m = ms[0]
  464. }
  465. redo:
  466. inst := insts[pc]
  467. switch inst.OpCode {
  468. case opChar:
  469. if sp >= len(src) || !inst.Class.Matches(int(src[sp])) {
  470. return false, sp, m
  471. }
  472. pc++
  473. sp++
  474. goto redo
  475. case opMatch:
  476. return true, sp, m
  477. case opTailMatch:
  478. return sp >= len(src), sp, m
  479. case opJmp:
  480. pc = inst.Operand1
  481. goto redo
  482. case opSplit:
  483. if ok, nsp, _ := recursiveVM(src, insts, inst.Operand1, sp, m); ok {
  484. return true, nsp, m
  485. }
  486. pc = inst.Operand2
  487. goto redo
  488. case opSave:
  489. s := m.setCapture(inst.Operand1, sp)
  490. if ok, nsp, _ := recursiveVM(src, insts, pc+1, sp, m); ok {
  491. return true, nsp, m
  492. }
  493. m.restoreCapture(inst.Operand1, s)
  494. return false, sp, m
  495. case opPSave:
  496. m.addPosCapture(inst.Operand1, sp+1)
  497. pc++
  498. goto redo
  499. case opBrace:
  500. if sp >= len(src) || int(src[sp]) != inst.Operand1 {
  501. return false, sp, m
  502. }
  503. count := 1
  504. for sp = sp + 1; sp < len(src); sp++ {
  505. if int(src[sp]) == inst.Operand2 {
  506. count--
  507. }
  508. if count == 0 {
  509. pc++
  510. sp++
  511. goto redo
  512. }
  513. if int(src[sp]) == inst.Operand1 {
  514. count++
  515. }
  516. }
  517. return false, sp, m
  518. case opNumber:
  519. idx := inst.Operand1 * 2
  520. if idx >= m.CaptureLength()-1 {
  521. panic(newError(_UNKNOWN, "invalid capture index"))
  522. }
  523. capture := src[m.Capture(idx):m.Capture(idx+1)]
  524. for i := 0; i < len(capture); i++ {
  525. if i+sp >= len(src) || capture[i] != src[i+sp] {
  526. return false, sp, m
  527. }
  528. }
  529. pc++
  530. sp += len(capture)
  531. goto redo
  532. }
  533. panic("should not reach here")
  534. return false, sp, m
  535. }
  536. /* }}} */
  537. /* API {{{ */
  538. func Find(p string, src []byte, offset, limit int) (matches []*MatchData, err error) {
  539. defer func() {
  540. if v := recover(); v != nil {
  541. if perr, ok := v.(*Error); ok {
  542. err = perr
  543. } else {
  544. panic(v)
  545. }
  546. }
  547. }()
  548. pat := parsePattern(newScanner([]byte(p)), true)
  549. insts := compilePattern(pat)
  550. matches = []*MatchData{}
  551. for sp := offset; sp <= len(src); {
  552. ok, nsp, ms := recursiveVM(src, insts, 0, sp)
  553. sp++
  554. if ok {
  555. if sp < nsp {
  556. sp = nsp
  557. }
  558. matches = append(matches, ms)
  559. }
  560. if len(matches) == limit || pat.MustHead {
  561. break
  562. }
  563. }
  564. return
  565. }
  566. /* }}} */