Initial separator lexing.

This commit is contained in:
David Grant 2024-06-20 20:44:18 -07:00
parent 2b4d7535f5
commit ffd82ef1df
2 changed files with 67 additions and 5 deletions

View File

@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
l.tokenStartLoc = l.location()
}
// tokenKindPostprocessors maps a token kind to an optional rewrite of the
// lexed text. The rewrite runs before the token is appended to the tokens
// list; kinds with no entry here keep their text as lexed.
var tokenKindPostprocessors = map[tokenKind]func(string) string{
	tokenNumber: stripDigitSeparators,
}

// stripDigitSeparators returns lexeme with every '_' removed, so that a
// number written with underscore digit separators is stored without them.
func stripDigitSeparators(lexeme string) string {
	return strings.ReplaceAll(lexeme, "_", "")
}
func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
// Run the postprocessor if the token kind has one defined.
if pp, ok := tokenKindPostprocessors[kind]; ok {
data = pp(data)
}
l.tokens = append(l.tokens, token{
kind: kind,
fodder: l.fodder,
@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
// that the next rune to be served by the lexer will be a leading digit.
func (l *lexer) lexNumber() error {
// This function should be understood with reference to the linked image:
// http://www.json.org/number.gif
// https://www.json.org/img/number.png
// Note, we deviate from the json.org documentation as follows:
// There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@ -465,9 +479,11 @@ func (l *lexer) lexNumber() error {
numAfterOneToNine
numAfterDot
numAfterDigit
numAfterUnderscore
numAfterE
numAfterExpSign
numAfterExpDigit
numAfterExpUnderscore
)
state := numBegin
@ -492,6 +508,9 @@ outerLoop:
state = numAfterDot
case 'e', 'E':
state = numAfterE
case '_':
state = numAfterUnderscore
default:
break outerLoop
}
@ -503,6 +522,8 @@ outerLoop:
state = numAfterE
case r >= '0' && r <= '9':
state = numAfterOneToNine
case r == '_':
state = numAfterUnderscore
default:
break outerLoop
}
@ -521,9 +542,28 @@ outerLoop:
state = numAfterE
case r >= '0' && r <= '9':
state = numAfterDigit
case r == '_':
state = numAfterUnderscore
default:
break outerLoop
}
case numAfterUnderscore:
// The only valid transition out of _ is to a digit.
switch {
case r == '_':
return l.makeStaticErrorPoint(
"Couldn't lex number, multiple consecutive _'s",
l.location())
case r >= '0' && r <= '9':
state = numAfterExpDigit
default:
return l.makeStaticErrorPoint(
fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
l.location())
}
case numAfterE:
switch {
case r == '+' || r == '-':
@ -545,9 +585,12 @@ outerLoop:
}
case numAfterExpDigit:
if r >= '0' && r <= '9' {
switch {
case r >= '0' && r <= '9':
state = numAfterExpDigit
} else {
case r == '_':
state = numAfterUnderscore
default:
break outerLoop
}
}
@ -965,7 +1008,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
l.location())
}
}
}

View File

@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
@ -314,6 +314,26 @@ func TestNumber1epExc(t *testing.T) {
SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
}
// TestNumberSeparators checks that a number written with '_' digit separators
// lexes to a single number token whose data has the underscores removed.
func TestNumberSeparators(t *testing.T) {
	cases := []struct {
		input string // source text handed to the lexer
		want  string // expected data of the resulting number token
	}{
		{input: "123_456", want: "123456"},
	}
	for _, tc := range cases {
		SingleTest(t, tc.input, "", Tokens{{kind: tokenNumber, data: tc.want}})
	}

	// The cases below are still commented out. They are written in a C++
	// testLex form (presumably taken from another implementation's test
	// suite — confirm) and have not been ported to SingleTest yet.
	/*
	   testLex("number 123_456", "123_456", {Token(Token::Kind::NUMBER, "123456")}, "");
	   testLex("number 1_750_000", "1_750_000", {Token(Token::Kind::NUMBER, "1750000")}, "");
	   testLex("number 1_2_3", "1_2_3", {Token(Token::Kind::NUMBER, "123")}, "");
	   testLex("number 3.141_592", "3.141_592", {Token(Token::Kind::NUMBER, "3.141592")}, "");
	   testLex("number 01_100", "01_100", {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "1100")}, "");
	   testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, "");
	   testLex("number 0e1_01", "0e1_01", {Token(Token::Kind::NUMBER, "0e101")}, "");
	   testLex("number 10_10e3", "10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, "");
	   testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, "");
	   testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
	   testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
	   testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
	*/
}
func TestDoublestring1(t *testing.T) {
SingleTest(t, "\"hi\"", "", Tokens{
{kind: tokenStringDouble, data: "hi"},