// Copyright 2018 The CUE Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package scanner implements a scanner for CUE source text. It takes a []byte
// as source which can then be tokenized through repeated calls to the Scan
// method.
package scanner // import "cuelang.org/go/cue/scanner"

import (
	"bytes"
	"fmt"
	"path/filepath"
	"strconv"
	"unicode"
	"unicode/utf8"

	"cuelang.org/go/cue/errors"
	"cuelang.org/go/cue/token"
)
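
// The following is a minimal usage sketch, not code from this package; how
// the *token.File passed to Init is obtained from the companion token
// package is assumed and not shown here:
//
//	var s Scanner
//	s.Init(file, src, nil, ScanComments) // file.Size() must equal len(src)
//	for {
//		_, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		fmt.Println(tok, lit)
//	}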

// A Scanner holds the Scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	file *token.File    // source file handle
	dir  string         // directory portion of file.Name()
	src  []byte         // source
	err  errors.Handler // error reporting; or nil
	mode Mode           // scanning mode

	// scanning state
	ch              rune // current character
	offset          int  // character offset
	rdOffset        int  // reading offset (position after current character)
	lineOffset      int  // current line offset
	linesSinceLast  int  // number of newlines seen since the last token
	spacesSinceLast int  // number of spaces and tabs seen since the last token
	insertEOL       bool // insert a comma before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}

const bom = 0xFEFF // byte order mark, only permitted as very first character

// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
func (s *Scanner) next() {
	if s.rdOffset < len(s.src) {
		s.offset = s.rdOffset
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		r, w := rune(s.src[s.rdOffset]), 1
		switch {
		case r == 0:
			s.error(s.offset, "illegal character NUL")
		case r >= utf8.RuneSelf:
			// not ASCII
			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
			if r == utf8.RuneError && w == 1 {
				s.error(s.offset, "illegal UTF-8 encoding")
			} else if r == bom && s.offset > 0 {
				s.error(s.offset, "illegal byte order mark")
			}
		}
		s.rdOffset += w
		s.ch = r
	} else {
		s.offset = len(s.src)
		if s.ch == '\n' {
			s.lineOffset = s.offset
			s.file.AddLine(s.offset)
		}
		s.ch = -1 // eof
	}
}

// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint

// These constants are options to the Init function.
const (
	ScanComments     Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertCommas                  // do not automatically insert commas - for testing only
)

// Init prepares the scanner s to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file set file
// for position information and it adds line information for each line.
// It is ok to re-use the same file when re-scanning the same source, as
// line information which is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to Scan will invoke the error handler err if they encounter a
// syntax error and err is not nil. Also, for each error encountered,
// the Scanner field ErrorCount is incremented by one. The mode parameter
// determines how comments are handled.
//
// Note that Init may call err if there is an error in the first character
// of the file.
func (s *Scanner) Init(file *token.File, src []byte, err errors.Handler, mode Mode) {
	// Explicitly initialize all fields since a scanner may be reused.
	if file.Size() != len(src) {
		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
	}
	s.file = file
	s.dir, _ = filepath.Split(file.Name())
	s.src = src
	s.err = err
	s.mode = mode

	s.ch = ' '
	s.offset = 0
	s.rdOffset = 0
	s.lineOffset = 0
	s.insertEOL = false
	s.ErrorCount = 0

	s.next()
	if s.ch == bom {
		s.next() // ignore BOM at file beginning
	}
}

func (s *Scanner) error(offs int, msg string) {
	if s.err != nil {
		s.err(s.file.Position(s.file.Pos(offs, 0)), msg)
	}
	s.ErrorCount++
}
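
// As a sketch (not code from this file), an errors.Handler passed to Init
// can simply print what it receives; the handler's parameter types are
// assumed here from how s.err is invoked above:
//
//	eh := func(pos token.Position, msg string) {
//		fmt.Printf("%v: %s\n", pos, msg)
//	}
//	s.Init(file, src, eh, ScanComments)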

var prefix = []byte("//line ")

func (s *Scanner) interpretLineComment(text []byte) {
	if bytes.HasPrefix(text, prefix) {
		// get filename and line number, if any
		if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
			if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
				// valid //line filename:line comment
				filename := string(bytes.TrimSpace(text[len(prefix):i]))
				if filename != "" {
					filename = filepath.Clean(filename)
					if !filepath.IsAbs(filename) {
						// make filename relative to current directory
						filename = filepath.Join(s.dir, filename)
					}
				}
				// update scanner position
				s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
			}
		}
	}
}
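
// For illustration (not code from this file): a line comment of the form
//
//	//line foo.cue:10
//
// appearing at the start of a line causes the positions reported for the
// following lines to be attributed to file "foo.cue", starting at line 10.
// The file name "foo.cue" is just an example.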

func (s *Scanner) scanComment() string {
	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
	offs := s.offset - 1 // position of initial '/'
	hasCR := false

	if s.ch == '/' {
		//-style comment
		s.next()
		for s.ch != '\n' && s.ch >= 0 {
			if s.ch == '\r' {
				hasCR = true
			}
			s.next()
		}
		if offs == s.lineOffset {
			// comment starts at the beginning of the current line
			s.interpretLineComment(s.src[offs:s.offset])
		}
		goto exit
	}

	/*-style comment */
	s.next()
	for s.ch >= 0 {
		ch := s.ch
		if ch == '\r' {
			hasCR = true
		}
		s.next()
		if ch == '*' && s.ch == '/' {
			s.next()
			goto exit
		}
	}

	s.error(offs, "comment not terminated")

exit:
	lit := s.src[offs:s.offset]
	if hasCR {
		// TODO: preserve \r\n
		lit = stripCR(lit)
	}

	return string(lit)
}

func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace(0) // s.insertEOL is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	return false
}

func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
}

func isDigit(ch rune) bool {
	// TODO(mpvl): Is this correct?
	return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
}

func (s *Scanner) scanIdentifier() string {
	offs := s.offset
	for isLetter(s.ch) || isDigit(s.ch) || s.ch == '_' {
		s.next()
	}
	return string(s.src[offs:s.offset])
}

func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case ch == '_':
		return 0
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}

func (s *Scanner) scanMantissa(base int) {
	var last rune
	for digitVal(s.ch) < base {
		last = s.ch
		s.next()
	}
	if last == '_' {
		s.error(s.offset-1, "illegal '_' in number")
	}
}

// scanNumber scans an integer or floating-point literal. It handles
// hexadecimal (0x), binary (0b), octal (leading 0), and decimal forms, an
// optional fraction and 'e' exponent, and the multiplier suffixes
// K, M, G, T, P, E, Z, Y, optionally followed by 'i'. If seenDecimalPoint
// is set, the leading '.' has already been consumed by the caller.
func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
	// digitVal(s.ch) < 10
	offs := s.offset
	tok := token.INT

	if seenDecimalPoint {
		offs--
		tok = token.FLOAT
		s.scanMantissa(10)
		goto exponent
	}

	if s.ch == '0' {
		// int or float
		offs := s.offset
		s.next()
		if s.ch == 'x' || s.ch == 'X' {
			// hexadecimal int
			s.next()
			s.scanMantissa(16)
			if s.offset-offs <= 2 {
				// only scanned "0x" or "0X"
				s.error(offs, "illegal hexadecimal number")
			}
		} else if s.ch == 'b' || s.ch == 'B' {
			// binary int
			s.next()
			s.scanMantissa(2)
			if s.offset-offs <= 2 {
				// only scanned "0b" or "0B"
				s.error(offs, "illegal binary number")
			}
		} else {
			// octal int or float
			seenDecimalDigit := false
			s.scanMantissa(8)
			if s.ch == '8' || s.ch == '9' {
				// illegal octal int or float
				seenDecimalDigit = true
				s.scanMantissa(10)
			}
			// TODO: disallow complex.
			if s.ch == '.' || s.ch == 'e' {
				goto fraction
			}
			// octal int
			if seenDecimalDigit {
				s.error(offs, "illegal octal number")
			}
		}
		goto exit
	}

	// decimal int or float
	s.scanMantissa(10)

	// TODO: allow 3h4s, etc.
	// switch s.ch {
	// case 'h', 'm', 's', "µ"[0], 'u', 'n':
	// }

fraction:
	if s.ch == '.' {
		if p := s.offset + 1; p < len(s.src) && s.src[p] == '.' {
			// interpret dot as part of a range.
			goto exit
		}
		tok = token.FLOAT
		s.next()
		s.scanMantissa(10)
	}

exponent:
	switch s.ch {
	case 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y':
		tok = token.INT // TODO: Or should we allow this to be a float?
		s.next()
		if s.ch == 'i' {
			s.next()
		}
		goto exit
	}

	// TODO: allow 'E' for exponent? Could be used for Exa
	if s.ch == 'e' { // || s.ch == 'E' {
		tok = token.FLOAT
		s.next()
		if s.ch == '-' || s.ch == '+' {
			s.next()
		}
		s.scanMantissa(10)
	}

exit:
	return tok, string(s.src[offs:s.offset])
}

// scanEscape parses an escape sequence where quote is the accepted
// escaped quote. In case of a syntax error, it stops at the offending
// character (without consuming it) and reports ok as false. Otherwise
// ok is true. template reports whether the escape introduces an
// interpolation ('\(').
func (s *Scanner) scanEscape(quote rune) (ok, template bool) {
	offs := s.offset

	var n int
	var base, max uint32
	switch s.ch {
	// TODO: remove
	case '(':
		return true, true
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
		s.next()
		return true, false
	case '0', '1', '2', '3', '4', '5', '6', '7':
		n, base, max = 3, 8, 255
	case 'x':
		s.next()
		n, base, max = 2, 16, 255
	case 'u':
		s.next()
		n, base, max = 4, 16, unicode.MaxRune
	case 'U':
		s.next()
		n, base, max = 8, 16, unicode.MaxRune
	default:
		msg := "unknown escape sequence"
		if s.ch < 0 {
			msg = "escape sequence not terminated"
		}
		s.error(offs, msg)
		return false, false
	}

	var x uint32
	for n > 0 {
		d := uint32(digitVal(s.ch))
		if d >= base {
			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
			if s.ch < 0 {
				msg = "escape sequence not terminated"
			}
			s.error(s.offset, msg)
			return false, false
		}
		x = x*base + d
		s.next()
		n--
	}

	if x > max || 0xD800 <= x && x < 0xE000 {
		s.error(offs, "escape sequence is invalid Unicode code point")
		return false, false
	}

	return true, false
}

func (s *Scanner) scanString(quote rune, offset, numQuotes int) (token.Token, string) {
	// ", """, ', or ''' opening already consumed
	offs := s.offset - offset

	tok := token.STRING

	hasCR := false
	extra := 0
	for {
		ch, n := s.consumeQuotes(quote, numQuotes)
		if n == numQuotes {
			break
		}
		if (numQuotes != 3 && ch == '\n') || ch < 0 {
			s.error(offs, "string literal not terminated")
			lit := s.src[offs:s.offset]
			if hasCR {
				lit = stripCR(lit)
			}
			return tok, string(lit)
		}
		if ch == '\r' && numQuotes == 3 {
			hasCR = true
		}
		s.next()
		if ch == '\\' {
			if s.ch == '(' {
				tok = token.INTERPOLATION
				extra = 1
				break
			}
			s.scanEscape(quote)
		}
	}
	lit := s.src[offs : s.offset+extra]
	if hasCR {
		lit = stripCR(lit)
	}
	return tok, string(lit)
}

func (s *Scanner) consumeQuotes(quote rune, max int) (next rune, n int) {
	for ; n < max; n++ {
		if s.ch != quote {
			return s.ch, n
		}
		s.next()
	}
	return s.ch, n
}

func stripCR(b []byte) []byte {
	c := make([]byte, len(b))
	i := 0
	for _, ch := range b {
		if ch != '\r' {
			c[i] = ch
			i++
		}
	}
	return c[:i]
}

func (s *Scanner) scanRawString() string {
	// '`' opening already consumed
	offs := s.offset - 1

	hasCR := false
	for {
		ch := s.ch
		if ch < 0 {
			s.error(offs, "raw string literal not terminated")
			break
		}
		s.next()
		if ch == '`' {
			break
		}
		if ch == '\r' {
			hasCR = true
		}
	}

	lit := s.src[offs:s.offset]
	if hasCR {
		lit = stripCR(lit)
	}

	return string(lit)
}

func (s *Scanner) skipWhitespace(inc int) {
	for {
		switch s.ch {
		case ' ', '\t':
			s.spacesSinceLast += inc
		case '\n':
			s.linesSinceLast += inc
			if s.insertEOL {
				return
			}
		case '\r':
		default:
			return
		}
		s.next()
	}
}

// Helper function for scanning two-byte tokens such as >= .
// If the next character is '=', the result is tok1; otherwise
// the result is tok0.

func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
	if s.ch == '=' {
		s.next()
		return tok1
	}
	return tok0
}

// ResumeInterpolation resumes scanning of a string interpolation.
func (s *Scanner) ResumeInterpolation(quote rune, numQuotes int) string {
	_, str := s.scanString(quote, 1, numQuotes)
	return str
}

// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// EOF.
//
// If the returned token is a literal (IDENT, INT, FLOAT,
// IMAG, CHAR, STRING) or COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is COMMA, the corresponding
// literal string is "," if the comma was present in the source,
// and "\n" if the comma was inserted because of a newline or
// at EOF.
//
// If the returned token is ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace(1)

	var rel token.RelPos
	switch {
	case s.linesSinceLast > 1:
		rel = token.NewSection
	case s.linesSinceLast == 1:
		rel = token.Newline
	case s.spacesSinceLast > 0:
		rel = token.Blank
	default:
		rel = token.NoSpace
	}
	// current token start
	offset := s.offset
	pos = s.file.Pos(offset, rel)

	// determine token value
	insertEOL := false
	switch ch := s.ch; {
	// case ch == '$':
	// 	lit = string(rune(ch))
	// 	s.next()
	// 	fallthrough
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.TRUE, token.FALSE, token.NULL, token.BOTTOM:
				insertEOL = true
			}
		} else {
			insertEOL = true
			tok = token.IDENT
		}
	case '0' <= ch && ch <= '9':
		insertEOL = true
		tok, lit = s.scanNumber(false)
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			if s.insertEOL {
				s.insertEOL = false // EOF consumed
				return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
			}
			tok = token.EOF
		case '_':
			if s.ch == '|' {
				// Unconditionally require this to be followed by another
				// underscore to avoid needing an extra lookahead.
				// Note that `_|x` is always equal to x.
				s.next()
				if s.ch != '_' {
					s.error(s.file.Offset(pos), "illegal token '_|'; expected '_'")
					insertEOL = s.insertEOL // preserve insertEOL info
					tok = token.ILLEGAL
					lit = "_|"
					break
				}
				s.next()
				tok = token.BOTTOM
				lit = "_|_"
			} else {
				tok = token.IDENT
				lit = "_" + s.scanIdentifier()
			}
			insertEOL = true
		case '\n':
			// we only reach here if s.insertEOL was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertEOL = false // newline consumed
			return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
		case '"', '\'':
			insertEOL = true
			switch _, n := s.consumeQuotes(ch, 2); n {
			case 1:
				if ch == '"' {
					tok, lit = token.STRING, `""`
				} else {
					tok, lit = token.STRING, `''`
				}
			default:
				tok, lit = s.scanString(ch, n+1, n+1)
			}
		case '`':
			insertEOL = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = token.COLON
		case ';':
			tok = token.SEMICOLON
			insertEOL = true
		case '.':
			if '0' <= s.ch && s.ch <= '9' {
				insertEOL = true
				tok, lit = s.scanNumber(true)
			} else if s.ch == '.' {
				s.next()
				if s.ch == '.' {
					s.next()
					tok = token.ELLIPSIS
				} else {
					tok = token.RANGE
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
			lit = ","
		case '(':
			tok = token.LPAREN
		case ')':
			insertEOL = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertEOL = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertEOL = true
			tok = token.RBRACE
		case '+':
			tok = token.ADD // Consider ++ for list concatenate.
		case '-':
			if s.ch == '>' {
				s.next()
				tok = token.LAMBDA
			} else {
				tok = token.SUB
			}
		case '*':
			tok = token.MUL
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertEOL && s.findLineEnd() {
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertEOL = false // newline consumed
					return s.file.Pos(offset, token.Elided), token.COMMA, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertEOL = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = token.QUO
			}
		case '%':
			tok = token.REM
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch2(token.LSS, token.LEQ)
			}
		case '>':
			tok = s.switch2(token.GTR, token.GEQ)
		case '=':
			tok = s.switch2(token.BIND, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			switch s.ch {
			case '&':
				s.next()
				tok = token.LAND
			default:
				tok = token.UNIFY
			}
		case '|':
			if s.ch == '|' {
				s.next()
				tok = token.LOR
			} else {
				tok = token.DISJUNCTION
			}
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
			}
			insertEOL = s.insertEOL // preserve insertEOL info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertCommas == 0 {
		s.insertEOL = insertEOL
	}

	s.linesSinceLast = 0
	s.spacesSinceLast = 0
	return
}
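
// As an illustrative sketch (not code from this file), scanning the source
//
//	a: 1
//	b: 2
//
// in the default mode yields roughly the token stream
//
//	IDENT("a") COLON INT("1") COMMA("\n")
//	IDENT("b") COLON INT("2") COMMA("\n") EOF
//
// where the COMMA tokens with literal "\n" are the elided commas inserted
// at the newline and at EOF, as described in the Scan documentation.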