scanner.go

package parser

import (
	"fmt"
	"unicode"
	"unicode/utf8"

	"github.com/d5/tengo/v2/token"
)

// byte order mark
const bom = 0xFEFF

// ScanMode represents a scanner mode.
type ScanMode int

// List of scanner modes.
const (
	ScanComments    ScanMode = 1 << iota // return comments as Comment tokens
	DontInsertSemis                      // do not automatically insert semicolons
)

// ScannerErrorHandler is an error handler for the scanner.
type ScannerErrorHandler func(pos SourceFilePos, msg string)

// Scanner reads the Tengo source text. It's based on Go's scanner
// implementation.
type Scanner struct {
	file         *SourceFile         // source file handle
	src          []byte              // source
	ch           rune                // current character
	offset       int                 // character offset
	readOffset   int                 // reading offset (position after current character)
	lineOffset   int                 // current line offset
	insertSemi   bool                // insert a semicolon before next newline
	errorHandler ScannerErrorHandler // error reporting; or nil
	errorCount   int                 // number of errors encountered
	mode         ScanMode
}

// NewScanner creates a Scanner.
func NewScanner(
	file *SourceFile,
	src []byte,
	errorHandler ScannerErrorHandler,
	mode ScanMode,
) *Scanner {
	if file.Size != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)",
			file.Size, len(src)))
	}

	s := &Scanner{
		file:         file,
		src:          src,
		errorHandler: errorHandler,
		ch:           ' ',
		mode:         mode,
	}
	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
	return s
}
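
// A minimal usage sketch (an illustration, not part of the original file):
// it assumes NewFileSet and AddFile from this package's source-file API to
// obtain a *SourceFile, then reads tokens in a loop until EOF.
//
//	src := []byte(`a := 1 + 2`)
//	file := NewFileSet().AddFile("example", -1, len(src))
//	s := NewScanner(file, src, nil, 0)
//	for {
//		tok, lit, _ := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Println(tok, lit)
//	}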

// ErrorCount returns the number of errors.
func (s *Scanner) ErrorCount() int {
	return s.errorCount
}

// Scan returns a token, token literal and its position.
func (s *Scanner) Scan() (
	tok token.Token,
	literal string,
	pos Pos,
) {
	s.skipWhitespace()

	pos = s.file.FileSetPos(s.offset)
	insertSemi := false

	// determine token value
	switch ch := s.ch; {
	case isLetter(ch):
		literal = s.scanIdentifier()
		tok = token.Lookup(literal)
		switch tok {
		case token.Ident, token.Break, token.Continue, token.Return,
			token.Export, token.True, token.False, token.Undefined:
			insertSemi = true
		}
	case ('0' <= ch && ch <= '9') ||
		(ch == '.' && '0' <= s.peek() && s.peek() <= '9'):
		insertSemi = true
		tok, literal = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1: // EOF
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return token.Semicolon, "\n", pos
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was set in the first place
			s.insertSemi = false // newline consumed
			return token.Semicolon, "\n", pos
		case '"':
			insertSemi = true
			tok = token.String
			literal = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.Char
			literal = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.String
			literal = s.scanRawString()
		case ':':
			tok = s.switch2(token.Colon, token.Define)
		case '.':
			tok = token.Period
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.Ellipsis
			}
		case ',':
			tok = token.Comma
		case '?':
			tok = token.Question
		case ';':
			tok = token.Semicolon
			literal = ";"
		case '(':
			tok = token.LParen
		case ')':
			insertSemi = true
			tok = token.RParen
		case '[':
			tok = token.LBrack
		case ']':
			insertSemi = true
			tok = token.RBrack
		case '{':
			tok = token.LBrace
		case '}':
			insertSemi = true
			tok = token.RBrace
		case '+':
			tok = s.switch3(token.Add, token.AddAssign, '+', token.Inc)
			if tok == token.Inc {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.Sub, token.SubAssign, '-', token.Dec)
			if tok == token.Dec {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.Mul, token.MulAssign)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.readOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return token.Semicolon, "\n", pos
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					return s.Scan()
				}
				tok = token.Comment
				literal = comment
			} else {
				tok = s.switch2(token.Quo, token.QuoAssign)
			}
		case '%':
			tok = s.switch2(token.Rem, token.RemAssign)
		case '^':
			tok = s.switch2(token.Xor, token.XorAssign)
		case '<':
			tok = s.switch4(token.Less, token.LessEq, '<',
				token.Shl, token.ShlAssign)
		case '>':
			tok = s.switch4(token.Greater, token.GreaterEq, '>',
				token.Shr, token.ShrAssign)
		case '=':
			tok = s.switch2(token.Assign, token.Equal)
		case '!':
			tok = s.switch2(token.Not, token.NotEqual)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AndNot, token.AndNotAssign)
			} else {
				tok = s.switch3(token.And, token.AndAssign, '&', token.LAnd)
			}
		case '|':
			tok = s.switch3(token.Or, token.OrAssign, '|', token.LOr)
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.error(s.file.Offset(pos),
					fmt.Sprintf("illegal character %#U", ch))
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.Illegal
			literal = string(ch)
		}
	}
	if s.mode&DontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}
	return
}
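
// For illustration (a sketch of the automatic semicolon insertion above,
// not part of the original file): with DontInsertSemis unset, scanning the
// two lines "x := 1" and "y := 2" (no trailing newline) yields
//
//	Ident("x"), Define, Int("1"), Semicolon("\n"),
//	Ident("y"), Define, Int("2"), Semicolon("\n"),
//
// where the first Semicolon comes from the newline and the second is
// inserted when EOF is reached; the next call then returns token.EOF.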

// next reads the next Unicode character into s.ch; s.ch < 0 means
// end-of-file.
func (s *Scanner) next() {
	if s.readOffset < len(s.src) {
		s.offset = s.readOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.readOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.readOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.readOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = -1 // eof
	}
}

// peek returns the byte following the most recently read character without
// advancing the scanner. If the scanner is at EOF, peek returns 0.
func (s *Scanner) peek() byte {
	if s.readOffset < len(s.src) {
		return s.src[s.readOffset]
	}
	return 0
}

// error reports a scanning error at the given byte offset to the error
// handler, if one is set, and increments the error count.
func (s *Scanner) error(offset int, msg string) {
	if s.errorHandler != nil {
		s.errorHandler(s.file.Position(s.file.FileSetPos(offset)), msg)
	}
	s.errorCount++
}

// scanComment scans a //-style or /*-style comment and returns its text.
func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	var numCR int

	if s.ch == '/' {
		//-style comment
		// (the final '\n' is not considered part of the comment)
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				numCR++
			}
			s.next()
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			numCR++
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			goto exit
		}
	}
	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]

	// On Windows, a //-comment line may end in "\r\n". Remove the final '\r'
	// first, then strip any other '\r' characters afterwards (matching the
	// behavior of Go's scanner).
	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
		numCR--
	}
	if numCR > 0 {
		lit = StripCR(lit, lit[1] == '*')
	}
	return string(lit)
}

// findLineEnd reports whether a newline appears before the next non-comment
// token. It is called when s.insertSemi is set and the scanner sits at the
// start of a comment (the initial '/' already consumed); the scanner state
// is restored before it returns.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.readOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment tok is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment tok
			return false
		}
		s.next() // consume '/'
	}
	return false
}

// scanIdentifier scans an identifier starting at the current character.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset
	for isLetter(s.ch) || isDigit(s.ch) {
		s.next()
	}
	return string(s.src[offs:s.offset])
}

// scanDigits consumes a run of digits in the given base; '_' separators are
// also consumed.
func (s *Scanner) scanDigits(base int) {
	for s.ch == '_' || digitVal(s.ch) < base {
		s.next()
	}
}

// scanNumber scans an integer or float literal starting at the current
// character, handling binary (0b), octal (0o) and hexadecimal (0x) prefixes
// as well as fractional and exponent parts.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.Int
	base := 10

	// determine base
	switch {
	case s.ch == '0' && lower(s.peek()) == 'b':
		base = 2
		s.next()
		s.next()
	case s.ch == '0' && lower(s.peek()) == 'o':
		base = 8
		s.next()
		s.next()
	case s.ch == '0' && lower(s.peek()) == 'x':
		base = 16
		s.next()
		s.next()
	}

	// scan whole number
	s.scanDigits(base)

	// scan fractional part
	if s.ch == '.' && (base == 10 || base == 16) {
		tok = token.Float
		s.next()
		s.scanDigits(base)
	}

	// scan exponent
	if s.ch == 'e' || s.ch == 'E' || s.ch == 'p' || s.ch == 'P' {
		tok = token.Float
		s.next()
		if s.ch == '-' || s.ch == '+' {
			s.next()
		}
		offs := s.offset
		s.scanDigits(10)
		if offs == s.offset {
			s.error(offs, "exponent has no digits")
		}
	}

	return tok, string(s.src[offs:s.offset])
}
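
// For illustration (examples, not part of the original file), scanNumber
// above tokenizes literal forms such as:
//
//	42       -> token.Int
//	0b1010   -> token.Int (binary)
//	0o755    -> token.Int (octal)
//	0x1F     -> token.Int (hexadecimal)
//	1_000    -> token.Int ('_' separators are skipped by scanDigits)
//	3.14     -> token.Float
//	1e-9     -> token.Float (decimal exponent)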

// scanEscape parses an escape sequence where quote is the closing quote; the
// initial '\' has already been consumed. It reports whether the sequence is
// valid.
func (s *Scanner) scanEscape(quote rune) bool {
	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return true
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false
	}

	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf(
				"illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false
		}
		x = x*base + d
		s.next()
		n--
	}

	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false
	}
	return true
}

// scanRune scans a rune literal; the opening '\'' has already been consumed.
func (s *Scanner) scanRune() string {
	offs := s.offset - 1 // '\'' opening already consumed

	valid := true
	n := 0
	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			// only report error if we don't have one already
			if valid {
				s.error(offs, "rune literal not terminated")
				valid = false
			}
			break
		}
		s.next()
		if ch == '\'' {
			break
		}
		n++
		if ch == '\\' {
			if !s.scanEscape('\'') {
				valid = false
			}
			// continue to read to closing quote
		}
	}

	if valid && n != 1 {
		s.error(offs, "illegal rune literal")
	}
	return string(s.src[offs:s.offset])
}

// scanString scans an interpreted string literal; the opening '"' has
// already been consumed.
func (s *Scanner) scanString() string {
	offs := s.offset - 1 // '"' opening already consumed

	for {
		ch := s.ch
		if ch == '\n' || ch < 0 {
			s.error(offs, "string literal not terminated")
			break
		}
		s.next()
		if ch == '"' {
			break
		}
		if ch == '\\' {
			s.scanEscape('"')
		}
	}
	return string(s.src[offs:s.offset])
}

// scanRawString scans a raw string literal; the opening '`' has already been
// consumed. Carriage returns are stripped from the result.
func (s *Scanner) scanRawString() string {
	offs := s.offset - 1 // '`' opening already consumed

	hasCR := false
	for {
		ch := s.ch
		if ch < 0 {
			s.error(offs, "raw string literal not terminated")
			break
		}
		s.next()
		if ch == '`' {
			break
		}
		if ch == '\r' {
			hasCR = true
		}
	}

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = StripCR(lit, false)
	}
	return string(lit)
}

// StripCR removes carriage return characters from b; comment indicates
// whether b is a /*-style comment.
func StripCR(b []byte, comment bool) []byte {
	c := make([]byte, len(b))
	i := 0
	for j, ch := range b {
		// In a /*-style comment, don't strip \r from *\r/ (incl. sequences of
		// \r from *\r\r...\r/) since the resulting */ would terminate the
		// comment too early unless the \r is immediately following the opening
		// /* in which case it's ok because /*/ is not closed yet.
		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' &&
			j+1 < len(b) && b[j+1] == '/' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}
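
// For illustration (not part of the original file):
// StripCR([]byte("a\r\nb"), false) returns "a\nb", while in comment mode a
// '\r' inside a trailing "*\r/" sequence is kept so that stripping does not
// create a premature "*/" terminator.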

// skipWhitespace skips spaces, tabs, carriage returns and, unless a
// semicolon is pending, newlines.
func (s *Scanner) skipWhitespace() {
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi ||
		s.ch == '\r' {
		s.next()
	}
}

// switch2 returns tok1 if the current character is '=' (consuming it), and
// tok0 otherwise.
func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

// switch3 returns tok1 if the current character is '=', tok2 if it is ch2
// (consuming the character in either case), and tok0 otherwise.
func (s *Scanner) switch3(
	tok0, tok1 token.Token,
	ch2 rune,
	tok2 token.Token,
) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		return tok2
	}
	return tok0
}

// switch4 is like switch3 but, after consuming ch2, returns tok3 if the next
// character is '=' (consuming it) and tok2 otherwise.
func (s *Scanner) switch4(
	tok0, tok1 token.Token,
	ch2 rune,
	tok2, tok3 token.Token,
) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	if s.ch == ch2 {
		s.next()
		if s.ch == '=' {
			s.next()
			return tok3
		}
		return tok2
	}
	return tok0
}
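
// For illustration (not part of the original file), Scan uses switch4 for
// '<' so that, after the initial '<' has been consumed:
//
//	"<"   -> token.Less
//	"<="  -> token.LessEq
//	"<<"  -> token.Shl
//	"<<=" -> token.ShlAssign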

// isLetter reports whether ch is an ASCII letter, '_', or a non-ASCII
// Unicode letter.
func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' ||
		ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

// isDigit reports whether ch is an ASCII digit or a non-ASCII Unicode digit.
func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9' ||
		ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}

// digitVal returns the numeric value of the hexadecimal digit ch, or 16 if
// ch is not a valid digit.
func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}

// lower returns lower-case c if c is an ASCII letter.
func lower(c byte) byte {
	return c | ('x' - 'X')
}