Initial separator lexing.

This commit is contained in:
David Grant 2024-06-20 20:44:18 -07:00
parent 2b4d7535f5
commit ffd82ef1df
2 changed files with 67 additions and 5 deletions

View File

@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
l.tokenStartLoc = l.location() l.tokenStartLoc = l.location()
} }
// tokenKindPostprocessors maps a token kind to an optional rewrite that is
// applied to the lexed token text before the token is stored in the tokens
// list. Kinds without an entry are stored unchanged.
var tokenKindPostprocessors = map[tokenKind]func(string) string{
	tokenNumber: stripDigitSeparators,
}

// stripDigitSeparators returns s with every underscore digit separator removed.
func stripDigitSeparators(s string) string {
	const separator = "_"
	return strings.ReplaceAll(s, separator, "")
}
func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) { func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
// Run the postprocessor if the token kind has one defined.
if pp, ok := tokenKindPostprocessors[kind]; ok {
data = pp(data)
}
l.tokens = append(l.tokens, token{ l.tokens = append(l.tokens, token{
kind: kind, kind: kind,
fodder: l.fodder, fodder: l.fodder,
@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
// that the next rune to be served by the lexer will be a leading digit. // that the next rune to be served by the lexer will be a leading digit.
func (l *lexer) lexNumber() error { func (l *lexer) lexNumber() error {
// This function should be understood with reference to the linked image: // This function should be understood with reference to the linked image:
// http://www.json.org/number.gif // https://www.json.org/img/number.png
// Note, we deviate from the json.org documentation as follows: // Note, we deviate from the json.org documentation as follows:
// There is no reason to lex negative numbers as atomic tokens, it is better to parse them // There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@ -465,9 +479,11 @@ func (l *lexer) lexNumber() error {
numAfterOneToNine numAfterOneToNine
numAfterDot numAfterDot
numAfterDigit numAfterDigit
numAfterUnderscore
numAfterE numAfterE
numAfterExpSign numAfterExpSign
numAfterExpDigit numAfterExpDigit
numAfterExpUnderscore
) )
state := numBegin state := numBegin
@ -492,6 +508,9 @@ outerLoop:
state = numAfterDot state = numAfterDot
case 'e', 'E': case 'e', 'E':
state = numAfterE state = numAfterE
case '_':
state = numAfterUnderscore
default: default:
break outerLoop break outerLoop
} }
@ -503,6 +522,8 @@ outerLoop:
state = numAfterE state = numAfterE
case r >= '0' && r <= '9': case r >= '0' && r <= '9':
state = numAfterOneToNine state = numAfterOneToNine
case r == '_':
state = numAfterUnderscore
default: default:
break outerLoop break outerLoop
} }
@ -521,9 +542,28 @@ outerLoop:
state = numAfterE state = numAfterE
case r >= '0' && r <= '9': case r >= '0' && r <= '9':
state = numAfterDigit state = numAfterDigit
case r == '_':
state = numAfterUnderscore
default: default:
break outerLoop break outerLoop
} }
case numAfterUnderscore:
// The only valid transition out of _ is to a digit.
switch {
case r == '_':
return l.makeStaticErrorPoint(
"Couldn't lex number, multiple consecutive _'s",
l.location())
case r >= '0' && r <= '9':
state = numAfterExpDigit
default:
return l.makeStaticErrorPoint(
fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
l.location())
}
case numAfterE: case numAfterE:
switch { switch {
case r == '+' || r == '-': case r == '+' || r == '-':
@ -545,9 +585,12 @@ outerLoop:
} }
case numAfterExpDigit: case numAfterExpDigit:
if r >= '0' && r <= '9' { switch {
case r >= '0' && r <= '9':
state = numAfterExpDigit state = numAfterExpDigit
} else { case r == '_':
state = numAfterUnderscore
default:
break outerLoop break outerLoop
} }
} }
@ -965,7 +1008,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)), fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
l.location()) l.location())
} }
} }
} }

View File

@ -314,6 +314,26 @@ func TestNumber1epExc(t *testing.T) {
SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{}) SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
} }
func TestNumberSeparators(t *testing.T) {
SingleTest(t, "123_456", "", Tokens{{kind: tokenNumber, data: "123456"}})
/*
testLex("number 123_456", "123_456", {Token(Token::Kind::NUMBER, "123456")}, "");
testLex("number 1_750_000", "1_750_000", {Token(Token::Kind::NUMBER, "1750000")}, "");
testLex("number 1_2_3", "1_2_3", {Token(Token::Kind::NUMBER, "123")}, "");
testLex("number 3.141_592", "3.141_592", {Token(Token::Kind::NUMBER, "3.141592")}, "");
testLex("number 01_100", "01_100", {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "1100")}, "");
testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, "");
testLex("number 0e1_01", "0e1_01", {Token(Token::Kind::NUMBER, "0e101")}, "");
testLex("number 10_10e3", "10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, "");
testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, "");
testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
*/
}
func TestDoublestring1(t *testing.T) { func TestDoublestring1(t *testing.T) {
SingleTest(t, "\"hi\"", "", Tokens{ SingleTest(t, "\"hi\"", "", Tokens{
{kind: tokenStringDouble, data: "hi"}, {kind: tokenStringDouble, data: "hi"},