Initial lexing of underscore digit separators in number literals.

2025-08-12 09:17:11 +02:00 · 2024-06-20 20:44:18 -07:00 · 2024-06-20 20:44:18 -07:00 · ffd82ef1df
commit ffd82ef1df
parent 2b4d7535f5
2 changed files with 67 additions and 5 deletions
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
 	l.tokenStartLoc = l.location()
 }

+// stripDigitSeparators drops every '_' digit separator from a number lexeme.
+func stripDigitSeparators(lexeme string) string { return strings.ReplaceAll(lexeme, "_", "") }
+
+// tokenKindPostprocessors maps a token kind to an optional rewrite of its raw
+// lexed text, applied before the token is stored in the tokens list.
+var tokenKindPostprocessors = map[tokenKind]func(string) string{
+	tokenNumber: stripDigitSeparators,
+}
+
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
+	// Run the postprocessor if the token kind has one defined.
+	if pp, ok := tokenKindPostprocessors[kind]; ok {
+		data = pp(data)
+	}
+
 	l.tokens = append(l.tokens, token{
 		kind:                  kind,
 		fodder:                l.fodder,
@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
 // that the next rune to be served by the lexer will be a leading digit.
 func (l *lexer) lexNumber() error {
 	// This function should be understood with reference to the linked image:
-	// http://www.json.org/number.gif
+	// https://www.json.org/img/number.png

 	// Note, we deviate from the json.org documentation as follows:
 	// There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@ -465,9 +479,12 @@ func (l *lexer) lexNumber() error {
 		numAfterOneToNine
 		numAfterDot
 		numAfterDigit
+		numAfterIntUnderscore  // '_' seen inside the integer digits
+		numAfterFracUnderscore // '_' seen inside the fraction digits
 		numAfterE
 		numAfterExpSign
 		numAfterExpDigit
+		numAfterExpUnderscore // '_' seen inside the exponent digits
 	)

 	state := numBegin
@ -492,6 +509,9 @@ outerLoop:
 				state = numAfterDot
 			case 'e', 'E':
 				state = numAfterE
+			case '_':
+				state = numAfterIntUnderscore
+
 			default:
 				break outerLoop
 			}
@ -503,6 +523,8 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
+			case r == '_':
+				state = numAfterIntUnderscore
 			default:
 				break outerLoop
 			}
@ -521,9 +543,37 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterDigit
+			case r == '_':
+				state = numAfterFracUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterIntUnderscore, numAfterFracUnderscore, numAfterExpUnderscore:
+			// The only valid transition out of _ is to a digit.
+			switch {
+			case r == '_':
+				return l.makeStaticErrorPoint(
+					"Couldn't lex number, multiple consecutive _'s",
+					l.location())
+
+			case r >= '0' && r <= '9':
+				// Resume the digit run the separator interrupted, so a later
+				// '.' or exponent is still accepted exactly where it was before.
+				switch state {
+				case numAfterIntUnderscore:
+					state = numAfterOneToNine
+				case numAfterFracUnderscore:
+					state = numAfterDigit
+				default:
+					state = numAfterExpDigit
+				}
+
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		case numAfterE:
 			switch {
 			case r == '+' || r == '-':
@ -545,9 +595,12 @@ outerLoop:
 			}

 		case numAfterExpDigit:
-			if r >= '0' && r <= '9' {
+			switch {
+			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
-			} else {
+			case r == '_':
+				state = numAfterExpUnderscore
+			default:
 				break outerLoop
 			}
 		}
@ -965,7 +1018,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
 					fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
 					l.location())
 			}
-
 		}
 	}

--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-    http://www.apache.org/licenses/LICENSE-2.0
+	http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@ -314,6 +314,26 @@ func TestNumber1epExc(t *testing.T) {
 	SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
 }

+func TestNumberSeparators(t *testing.T) {
+	SingleTest(t, "123_456", "", Tokens{{kind: tokenNumber, data: "123456"}})
+	SingleTest(t, "1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}})
+	SingleTest(t, "1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}})
+	SingleTest(t, "3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}})
+	SingleTest(t, "01_100", "", Tokens{{kind: tokenNumber, data: "0"}, {kind: tokenNumber, data: "1100"}})
+	SingleTest(t, "0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}})
+	SingleTest(t, "1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}})
+
+	// TODO: port these C++ cases; each needs a '.' or exponent accepted after a separated digit run.
+	/*
+		testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, "");
+		testLex("number 10_10e3", "10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, "");
+		testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, "");
+		testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
+		testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
+	*/
+}
+
 func TestDoublestring1(t *testing.T) {
 	SingleTest(t, "\"hi\"", "", Tokens{
 		{kind: tokenStringDouble, data: "hi"},