diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 3743436..f26978f 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
 	l.tokenStartLoc = l.location()
 }
 
+// tokenKindPostprocessors maps a token kind to an optional transformation that
+// is applied to the lexed token text before the token is appended to l.tokens.
+var tokenKindPostprocessors = map[tokenKind]func(string) string{
+	tokenNumber: func(s string) string {
+		// Get rid of underscore digit separators.
+		return strings.ReplaceAll(s, "_", "")
+	},
+}
+
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
+	// Run the postprocessor if the token kind has one defined.
+	if pp, ok := tokenKindPostprocessors[kind]; ok {
+		data = pp(data)
+	}
+
 	l.tokens = append(l.tokens, token{
 		kind: kind,
 		fodder: l.fodder,
@@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
 // that the next rune to be served by the lexer will be a leading digit.
 func (l *lexer) lexNumber() error {
 	// This function should be understood with reference to the linked image:
-	// http://www.json.org/number.gif
+	// https://www.json.org/img/number.png
 
 	// Note, we deviate from the json.org documentation as follows:
 	// There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@@ -465,9 +479,12 @@ func (l *lexer) lexNumber() error {
 		numAfterOneToNine
 		numAfterDot
 		numAfterDigit
+		numAfterIntUnderscore
+		numAfterFracUnderscore
 		numAfterE
 		numAfterExpSign
 		numAfterExpDigit
+		numAfterExpUnderscore
 	)
 
 	state := numBegin
@@ -492,6 +509,9 @@ outerLoop:
 				state = numAfterDot
 			case 'e', 'E':
 				state = numAfterE
+			case '_':
+				state = numAfterIntUnderscore
+
 			default:
 				break outerLoop
 			}
@@ -503,6 +523,8 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
+			case r == '_':
+				state = numAfterIntUnderscore
 			default:
 				break outerLoop
 			}
@@ -521,9 +543,38 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterDigit
+			case r == '_':
+				state = numAfterFracUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterIntUnderscore, numAfterFracUnderscore, numAfterExpUnderscore:
+			// The only valid transition out of _ is to a digit. The digit
+			// resumes the part of the number (integer, fraction or exponent)
+			// in which the underscore appeared, so that '.', 'e' and 'E' keep
+			// being accepted or rejected exactly as they are without separators.
+			switch {
+			case r == '_':
+				return l.makeStaticErrorPoint(
+					"Couldn't lex number, multiple consecutive _'s",
+					l.location())
+
+			case r >= '0' && r <= '9':
+				switch state {
+				case numAfterIntUnderscore:
+					state = numAfterOneToNine
+				case numAfterFracUnderscore:
+					state = numAfterDigit
+				default:
+					state = numAfterExpDigit
+				}
+
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		case numAfterE:
 			switch {
 			case r == '+' || r == '-':
@@ -545,9 +596,12 @@ outerLoop:
 			}
 
 		case numAfterExpDigit:
-			if r >= '0' && r <= '9' {
+			switch {
+			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
-			} else {
+			case r == '_':
+				state = numAfterExpUnderscore
+			default:
 				break outerLoop
 			}
 		}
@@ -965,7 +1019,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
 					fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
 					l.location())
 			}
-
 		}
 	}
 
diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index c54ff0e..e7f2bee 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+	http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -314,6 +314,27 @@ func TestNumber1epExc(t *testing.T) {
 	SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
 }
 
+// TestNumberSeparators checks that '_' digit separators are accepted in the
+// integer, fraction and exponent parts of a number and stripped from the token.
+func TestNumberSeparators(t *testing.T) {
+	SingleTest(t, "123_456", "", Tokens{{kind: tokenNumber, data: "123456"}})
+	SingleTest(t, "1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}})
+	SingleTest(t, "1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}})
+	SingleTest(t, "3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}})
+	// A leading zero is still a complete number on its own.
+	SingleTest(t, "01_100", "", Tokens{
+		{kind: tokenNumber, data: "0"},
+		{kind: tokenNumber, data: "1100"},
+	})
+	SingleTest(t, "1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}})
+	SingleTest(t, "0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}})
+	SingleTest(t, "10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}})
+	SingleTest(t, "2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}})
+	SingleTest(t, "1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}})
+	SingleTest(t, "1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}})
+	SingleTest(t, "9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}})
+}
+
 func TestDoublestring1(t *testing.T) {
 	SingleTest(t, "\"hi\"", "", Tokens{
 		{kind: tokenStringDouble, data: "hi"},