From a04a6cf2e32650ec4a21167ebace505054cb1e3e Mon Sep 17 00:00:00 2001 From: Joe Beda Date: Thu, 21 Jan 2016 13:11:48 -0800 Subject: [PATCH] Initial commit with lexer --- LICENSE | 202 ++++++++++++++ README.md | 5 + lexer.go | 681 ++++++++++++++++++++++++++++++++++++++++++++++++ lexer_test.go | 255 ++++++++++++++++++ static_error.go | 95 +++++++ 5 files changed, 1238 insertions(+) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 lexer.go create mode 100644 lexer_test.go create mode 100644 static_error.go diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea63458 --- /dev/null +++ b/README.md @@ -0,0 +1,5 @@ +# go-jsonnet + +This is a port of [jsonnet](http://jsonnet.org/) to go. It is very much a work in progress. + +This implementation is largely based on the [jsonnet C++ implementation](https://github.com/google/jsonnet). \ No newline at end of file diff --git a/lexer.go b/lexer.go new file mode 100644 index 0000000..4f7ebf9 --- /dev/null +++ b/lexer.go @@ -0,0 +1,681 @@ +package jsonnet + +import ( + "bytes" + "fmt" + "strconv" + "strings" + "unicode/utf8" +) + +////////////////////////////////////////////////////////////////////////////// +// Fodder +// +// Fodder is stuff that is usually thrown away by lexers/preprocessors but is +// kept so that the source can be round tripped with full fidelity.
+type fodderKind int + +const ( + fodderWhitespace fodderKind = iota + fodderCommentC + fodderCommentCpp + fodderCommentHash +) + +type fodderElement struct { + kind fodderKind + data string +} + +type fodder []fodderElement + +////////////////////////////////////////////////////////////////////////////// +// Token + +type tokenKind int + +const ( + tokenInvalid tokenKind = iota + + // Symbols + tokenBraceL + tokenBraceR + tokenBracketL + tokenBracketR + tokenColon + tokenComma + tokenDollar + tokenDot + tokenParenL + tokenParenR + tokenSemicolon + + // Arbitrary length lexemes + tokenIdentifier + tokenNumber + tokenOperator + tokenStringDouble + tokenStringSingle + tokenStringBlock + + // Keywords + tokenAssert + tokenElse + tokenError + tokenFalse + tokenFor + tokenFunction + tokenIf + tokenImport + tokenImportStr + tokenIn + tokenLocal + tokenNullLit + tokenTailStrict + tokenThen + tokenSelf + tokenSuper + tokenTrue + + // A special token that holds line/column information about the end of the + // file. + tokenEndOfFile +) + +type token struct { + kind tokenKind // The type of the token + fodder fodder // Any fodder the occurs before this token + data string // Content of the token if it is not a keyword + + // Extra info for when kind == tokenStringBlock + stringBlockIndent string // The sequence of whitespace that indented the block. + stringBlockTermIndent string // This is always fewer whitespace characters than in stringBlockIndent. 
+ + loc LocationRange +} + +type tokens []token + +////////////////////////////////////////////////////////////////////////////// +// Helpers + +func isUpper(r rune) bool { + return r >= 'A' && r <= 'Z' +} + +func isLower(r rune) bool { + return r >= 'a' && r <= 'z' +} + +func isNumber(r rune) bool { + return r >= '0' && r <= '9' +} + +func isIdentifierFirst(r rune) bool { + return isUpper(r) || isLower(r) || r == '_' +} + +func isIdentifier(r rune) bool { + return isIdentifierFirst(r) || isNumber(r) +} + +func isSymbol(r rune) bool { + switch r { + case '&', '|', '^', '=', '<', '>', '*', '/', '%', '#': + return true + } + return false +} + +// Check that b has at least the same whitespace prefix as a and returns the +// amount of this whitespace, otherwise returns 0. If a has no whitespace +// prefix than return 0. +func checkWhitespace(a, b string) int { + i := 0 + for ; i < len(a); i++ { + if a[i] != ' ' && a[i] != '\t' { + // a has run out of whitespace and b matched up to this point. Return + // result. + return i + } + if i >= len(b) { + // We ran off the edge of b while a still has whitespace. Return 0 as + // failure. + return 0 + } + if a[i] != b[i] { + // a has whitespace but b does not. Return 0 as failure. + return 0 + } + } + // We ran off the end of a and b kept up + return i +} + +////////////////////////////////////////////////////////////////////////////// +// Lexer + +type lexer struct { + fileName string // The file name being lexed, only used for errors + input string // The input string + + pos int // Current byte position in input + lineNumber int // Current line number for pos + lineStart int // Byte position of start of line + + // Data about the state position of the lexer before previous call to + // 'next'. If this state is lost then prevPos is set to lexEOF and panic + // ensues. 
+ prevPos int // Byte position of last rune read + prevLineNumber int // The line number before last rune read + prevLineStart int // The line start before last rune read + + tokens tokens // The tokens that we've generated so far + + // Information about the token we are working on right now + fodder fodder + tokenStart int + tokenStartLoc Location +} + +const lexEOF = -1 + +func makeLexer(fn string, input string) *lexer { + return &lexer{ + fileName: fn, + input: input, + lineNumber: 1, + prevPos: lexEOF, + prevLineNumber: 1, + tokenStartLoc: Location{Line: 1, Column: 1}, + } +} + +// next returns the next rune in the input. +func (l *lexer) next() rune { + if int(l.pos) >= len(l.input) { + l.prevPos = l.pos + return lexEOF + } + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + l.prevPos = l.pos + l.pos += w + if r == '\n' { + l.prevLineNumber = l.lineNumber + l.prevLineStart = l.lineStart + l.lineNumber += 1 + l.lineStart = l.pos + } + return r +} + +func (l *lexer) acceptN(n int) { + for i := 0; i < n; i++ { + l.next() + } +} + +// peek returns but does not consume the next rune in the input. +func (l *lexer) peek() rune { + r := l.next() + l.backup() + return r +} + +// backup steps back one rune. Can only be called once per call of next. +func (l *lexer) backup() { + if l.prevPos == lexEOF { + panic("backup called with no valid previous rune") + } + l.lineNumber = l.prevLineNumber + l.lineStart = l.prevLineStart + l.pos = l.prevPos + l.prevPos = lexEOF +} + +func (l *lexer) location() Location { + return Location{Line: l.lineNumber, Column: l.pos - l.lineStart + 1} +} + +func (l *lexer) prevLocation() Location { + if l.prevPos == lexEOF { + panic("prevLocation called with no valid previous rune") + } + return Location{Line: l.prevLineNumber, Column: l.prevPos - l.prevLineStart + 1} +} + +// Reset the current working token start to the current cursor position. This +// may throw away some characters. This does not throw away any accumulated +// fodder. 
+func (l *lexer) resetTokenStart() { + l.tokenStart = l.pos + l.tokenStartLoc = l.location() +} + +func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) { + l.tokens = append(l.tokens, token{ + kind: kind, + fodder: l.fodder, + data: data, + stringBlockIndent: stringBlockIndent, + stringBlockTermIndent: stringBlockTermIndent, + loc: makeLocationRange(l.fileName, l.tokenStartLoc, l.location()), + }) + l.fodder = fodder{} +} + +func (l *lexer) emitToken(kind tokenKind) { + l.emitFullToken(kind, l.input[l.tokenStart:l.pos], "", "") + l.resetTokenStart() +} + +func (l *lexer) addWhitespaceFodder() { + fodderData := l.input[l.tokenStart:l.pos] + if len(l.fodder) == 0 || l.fodder[len(l.fodder)-1].kind != fodderWhitespace { + l.fodder = append(l.fodder, fodderElement{kind: fodderWhitespace, data: fodderData}) + } else { + l.fodder[len(l.fodder)-1].data += fodderData + } + l.resetTokenStart() +} + +func (l *lexer) addCommentFodder(kind fodderKind) { + fodderData := l.input[l.tokenStart:l.pos] + l.fodder = append(l.fodder, fodderElement{kind: kind, data: fodderData}) + l.resetTokenStart() +} + +func (l *lexer) addFodder(kind fodderKind, data string) { + l.fodder = append(l.fodder, fodderElement{kind: kind, data: data}) +} + +// lexNumber will consume a number and emit a token. It is assumed +// that the next rune to be served by the lexer will be a leading digit. +func (l *lexer) lexNumber() error { + // This function should be understood with reference to the linked image: + // http://www.json.org/number.gif + + // Note, we deviate from the json.org documentation as follows: + // There is no reason to lex negative numbers as atomic tokens, it is better to parse them + // as a unary operator combined with a numeric literal. This avoids x-1 being tokenized as + // <identifier> <number> instead of the intended <identifier> <binop> <number>.
+ + type numLexState int + const ( + numBegin numLexState = iota + numAfterZero + numAfterOneToNine + numAfterDot + numAfterDigit + numAfterE + numAfterExpSign + numAfterExpDigit + ) + + state := numBegin + for true { + r := l.next() + switch state { + case numBegin: + switch { + case r == '0': + state = numAfterZero + case r >= '1' && r <= '9': + state = numAfterOneToNine + default: + return makeStaticErrorPoint( + "Couldn't lex number", l.fileName, l.prevLocation()) + } + case numAfterZero: + switch r { + case '.': + state = numAfterDot + case 'e', 'E': + state = numAfterE + default: + goto end + } + case numAfterOneToNine: + switch { + case r == '.': + state = numAfterDot + case r == 'e' || r == 'E': + state = numAfterE + case r >= '0' && r <= '9': + state = numAfterOneToNine + default: + goto end + } + case numAfterDot: + switch { + case r >= '0' && r <= '9': + state = numAfterDigit + default: + return makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after decimal point: %v", strconv.QuoteRuneToASCII(r)), + l.fileName, l.prevLocation()) + } + case numAfterDigit: + switch { + case r == 'e' || r == 'E': + state = numAfterE + case r >= '0' && r <= '9': + state = numAfterDigit + default: + goto end + } + case numAfterE: + switch { + case r == '+' || r == '-': + state = numAfterExpSign + case r >= '0' && r <= '9': + state = numAfterExpDigit + default: + return makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after 'E': %v", strconv.QuoteRuneToASCII(r)), + l.fileName, l.prevLocation()) + } + case numAfterExpSign: + if r >= '0' && r <= '9' { + state = numAfterExpDigit + } else { + return makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after exponent sign: %v", strconv.QuoteRuneToASCII(r)), + l.fileName, l.prevLocation()) + } + + case numAfterExpDigit: + if r >= '0' && r <= '9' { + state = numAfterExpDigit + } else { + goto end + } + } + } +end: + l.backup() + l.emitToken(tokenNumber) + return nil +} + +// lexIdentifier will 
consume an identifier and emit a token. It is assumed +// that the next rune to be served by the lexer will be the first rune of an +// identifier. This may emit a keyword or an identifier. +func (l *lexer) lexIdentifier() { + r := l.next() + if !isIdentifierFirst(r) { + panic("Unexpected character in lexIdentifier") + } + for ; r != lexEOF; r = l.next() { + if !isIdentifier(r) { + break + } + } + l.backup() + + switch l.input[l.tokenStart:l.pos] { + case "assert": + l.emitToken(tokenAssert) + case "else": + l.emitToken(tokenElse) + case "error": + l.emitToken(tokenError) + case "false": + l.emitToken(tokenFalse) + case "for": + l.emitToken(tokenFor) + case "function": + l.emitToken(tokenFunction) + case "if": + l.emitToken(tokenIf) + case "import": + l.emitToken(tokenImport) + case "importstr": + l.emitToken(tokenImportStr) + case "in": + l.emitToken(tokenIn) + case "local": + l.emitToken(tokenLocal) + case "null": + l.emitToken(tokenNullLit) + case "self": + l.emitToken(tokenSelf) + case "super": + l.emitToken(tokenSuper) + case "tailstrict": + l.emitToken(tokenTailStrict) + case "then": + l.emitToken(tokenThen) + case "true": + l.emitToken(tokenTrue) + default: + // Not a keyword, assume it is an identifier + l.emitToken(tokenIdentifier) + } +} + +// lexSymbol will lex a token that starts with a symbol. This could be a +// comment, block quote or an operator. This function assumes that the next +// rune to be served by the lexer will be the first rune of the new token.
+func (l *lexer) lexSymbol() error { + r := l.next() + + // Single line C++ style comment + if r == '/' && l.peek() == '/' { + l.next() + l.resetTokenStart() // Throw out the leading // + for r = l.next(); r != lexEOF && r != '\n'; r = l.next() { + } + // Leave the '\n' in the lexer to be fodder for the next round + l.backup() + l.addCommentFodder(fodderCommentCpp) + return nil + } + + if r == '#' { + l.resetTokenStart() // Throw out the leading # + for r = l.next(); r != lexEOF && r != '\n'; r = l.next() { + } + // Leave the '\n' in the lexer to be fodder for the next round + l.backup() + l.addCommentFodder(fodderCommentHash) + return nil + } + + if r == '/' && l.peek() == '*' { + commentStartLoc := l.tokenStartLoc + l.next() // consume the '*' + l.resetTokenStart() // Throw out the leading /* + for r = l.next(); ; r = l.next() { + if r == lexEOF { + return makeStaticErrorPoint("Multi-line comment has no terminating */.", + l.fileName, commentStartLoc) + } + if r == '*' && l.peek() == '/' { + commentData := l.input[l.tokenStart : l.pos-1] // Don't include trailing */ + l.addFodder(fodderCommentC, commentData) + l.next() // Skip past '/' + l.resetTokenStart() // Start next token at this point + return nil + } + } + } + + if r == '|' && strings.HasPrefix(l.input[l.pos:], "||\n") { + commentStartLoc := l.tokenStartLoc + l.acceptN(3) // Skip "||\n" + var cb bytes.Buffer + + // Skip leading blank lines + for r = l.next(); r == '\n'; r = l.next() { + cb.WriteRune(r) + } + l.backup() + numWhiteSpace := checkWhitespace(l.input[l.pos:], l.input[l.pos:]) + stringBlockIndent := l.input[l.pos : l.pos+numWhiteSpace] + if numWhiteSpace == 0 { + return makeStaticErrorPoint("Text block's first line must start with whitespace.", + l.fileName, commentStartLoc) + } + + for { + if numWhiteSpace <= 0 { + panic("Unexpected value for numWhiteSpace") + } + l.acceptN(numWhiteSpace) + for r = l.next(); r != '\n'; r = l.next() { + if r == lexEOF { + return makeStaticErrorPoint("Unexpected 
EOF", + l.fileName, commentStartLoc) + } + cb.WriteRune(r) + } + cb.WriteRune('\n') + + // Skip any blank lines + for r = l.next(); r == '\n'; r = l.next() { + cb.WriteRune(r) + } + l.backup() + + // Look at the next line + numWhiteSpace = checkWhitespace(stringBlockIndent, l.input[l.pos:]) + if numWhiteSpace == 0 { + // End of the text block + var stringBlockTermIndent string + for r = l.next(); r == ' ' || r == '\t'; r = l.next() { + stringBlockTermIndent += string(r) + } + l.backup() + if !strings.HasPrefix(l.input[l.pos:], "|||") { + return makeStaticErrorPoint("Text block not terminated with |||", + l.fileName, commentStartLoc) + } + l.acceptN(3) // Skip '|||' + l.emitFullToken(tokenStringBlock, cb.String(), + stringBlockIndent, stringBlockTermIndent) + l.resetTokenStart() + return nil + } + } + } + + // Assume any string of symbols is a single operator. + for r = l.next(); isSymbol(r); r = l.next() { + + } + l.backup() + l.emitToken(tokenOperator) + return nil +} + +func lex(fn string, input string) (tokens, error) { + l := makeLexer(fn, input) + + var err error + + for r := l.next(); r != lexEOF; r = l.next() { + switch r { + case ' ', '\t', '\r', '\n': + l.addWhitespaceFodder() + continue + case '{': + l.emitToken(tokenBraceL) + case '}': + l.emitToken(tokenBraceR) + case '[': + l.emitToken(tokenBracketL) + case ']': + l.emitToken(tokenBracketR) + case ':': + l.emitToken(tokenColon) + case ',': + l.emitToken(tokenComma) + case '$': + l.emitToken(tokenDollar) + case '.': + l.emitToken(tokenDot) + case '(': + l.emitToken(tokenParenL) + case ')': + l.emitToken(tokenParenR) + case ';': + l.emitToken(tokenSemicolon) + + // Operators + case '!': + if l.peek() == '=' { + _ = l.next() + } + l.emitToken(tokenOperator) + case '~', '+', '-': + l.emitToken(tokenOperator) + + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + l.backup() + err = l.lexNumber() + if err != nil { + return nil, err + } + + // String literals + case '"': + stringStartLoc := 
l.prevLocation() + l.resetTokenStart() // Don't include the quotes in the token data + for r = l.next(); ; r = l.next() { + if r == lexEOF { + return nil, makeStaticErrorPoint("Unterminated String", l.fileName, stringStartLoc) + } + if r == '"' { + l.backup() + l.emitToken(tokenStringDouble) + _ = l.next() + l.resetTokenStart() + break + } + if r == '\\' && l.peek() != lexEOF { + r = l.next() + } + } + case '\'': + stringStartLoc := l.prevLocation() + l.resetTokenStart() // Don't include the quotes in the token data + for r = l.next(); ; r = l.next() { + if r == lexEOF { + return nil, makeStaticErrorPoint("Unterminated String", l.fileName, stringStartLoc) + } + if r == '\'' { + l.backup() + l.emitToken(tokenStringSingle) + r = l.next() + l.resetTokenStart() + break + } + if r == '\\' && l.peek() != lexEOF { + r = l.next() + } + } + default: + if isIdentifierFirst(r) { + l.backup() + l.lexIdentifier() + } else if isSymbol(r) { + l.backup() + err = l.lexSymbol() + if err != nil { + return nil, err + } + } else { + return nil, makeStaticErrorPoint( + fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)), + l.fileName, l.prevLocation()) + } + + } + } + + // We are currently at the EOF. 
Emit a special token to capture any + // trailing fodder + l.emitToken(tokenEndOfFile) + return l.tokens, nil +} diff --git a/lexer_test.go b/lexer_test.go new file mode 100644 index 0000000..a7a1da1 --- /dev/null +++ b/lexer_test.go @@ -0,0 +1,255 @@ +package jsonnet + +import ( + "testing" +) + +type lexTest struct { + name string + input string + tokens tokens + errString string +} + +var ( + tEOF = token{kind: tokenEndOfFile} +) + +var lexTests = []lexTest{ + {"empty", "", tokens{}, ""}, + {"whitespace", " \t\n\r\r\n", tokens{}, ""}, + + {"brace L", "{", tokens{{kind: tokenBraceL, data: "{"}}, ""}, + {"brace R", "}", tokens{{kind: tokenBraceR, data: "}"}}, ""}, + {"bracket L", "[", tokens{{kind: tokenBracketL, data: "["}}, ""}, + {"bracket R", "]", tokens{{kind: tokenBracketR, data: "]"}}, ""}, + {"colon", ":", tokens{{kind: tokenColon, data: ":"}}, ""}, + {"comma", ",", tokens{{kind: tokenComma, data: ","}}, ""}, + {"dollar", "$", tokens{{kind: tokenDollar, data: "$"}}, ""}, + {"dot", ".", tokens{{kind: tokenDot, data: "."}}, ""}, + {"paren L", "(", tokens{{kind: tokenParenL, data: "("}}, ""}, + {"paren R", ")", tokens{{kind: tokenParenR, data: ")"}}, ""}, + {"semicolon", ";", tokens{{kind: tokenSemicolon, data: ";"}}, ""}, + + {"not 1", "!", tokens{{kind: tokenOperator, data: "!"}}, ""}, + {"not 2", "! 
", tokens{{kind: tokenOperator, data: "!"}}, ""}, + {"not equal", "!=", tokens{{kind: tokenOperator, data: "!="}}, ""}, + {"tilde", "~", tokens{{kind: tokenOperator, data: "~"}}, ""}, + {"plus", "+", tokens{{kind: tokenOperator, data: "+"}}, ""}, + {"minus", "-", tokens{{kind: tokenOperator, data: "-"}}, ""}, + + {"number 0", "0", tokens{{kind: tokenNumber, data: "0"}}, ""}, + {"number 1", "1", tokens{{kind: tokenNumber, data: "1"}}, ""}, + {"number 1.0", "1.0", tokens{{kind: tokenNumber, data: "1.0"}}, ""}, + {"number 0.1", "0.1", tokens{{kind: tokenNumber, data: "0.1"}}, ""}, + {"number 0e100", "0e100", tokens{{kind: tokenNumber, data: "0e100"}}, ""}, + {"number 1e100", "1e100", tokens{{kind: tokenNumber, data: "1e100"}}, ""}, + {"number 1.1e100", "1.1e100", tokens{{kind: tokenNumber, data: "1.1e100"}}, ""}, + {"number 1.1e-100", "1.1e-100", tokens{{kind: tokenNumber, data: "1.1e-100"}}, ""}, + {"number 1.1e+100", "1.1e+100", tokens{{kind: tokenNumber, data: "1.1e+100"}}, ""}, + {"number 0100", "0100", tokens{ + {kind: tokenNumber, data: "0"}, + {kind: tokenNumber, data: "100"}, + }, ""}, + {"number 10+10", "10+10", tokens{ + {kind: tokenNumber, data: "10"}, + {kind: tokenOperator, data: "+"}, + {kind: tokenNumber, data: "10"}, + }, ""}, + {"number 1.+3", "1.+3", tokens{}, "number 1.+3:1:3 Couldn't lex number, junk after decimal point: '+'"}, + {"number 1e!", "1e!", tokens{}, "number 1e!:1:3 Couldn't lex number, junk after 'E': '!'"}, + {"number 1e+!", "1e+!", tokens{}, "number 1e+!:1:4 Couldn't lex number, junk after exponent sign: '!'"}, + + {"double string \"hi\"", "\"hi\"", tokens{{kind: tokenStringDouble, data: "hi"}}, ""}, + {"double string \"hi nl\"", "\"hi\n\"", tokens{{kind: tokenStringDouble, data: "hi\n"}}, ""}, + {"double string \"hi\\\"\"", "\"hi\\\"\"", tokens{{kind: tokenStringDouble, data: "hi\\\""}}, ""}, + {"double string \"hi\\nl\"", "\"hi\\\n\"", tokens{{kind: tokenStringDouble, data: "hi\\\n"}}, ""}, + {"double string \"hi", "\"hi", tokens{}, 
"double string \"hi:1:1 Unterminated String"}, + + {"single string 'hi'", "'hi'", tokens{{kind: tokenStringSingle, data: "hi"}}, ""}, + {"single string 'hi nl'", "'hi\n'", tokens{{kind: tokenStringSingle, data: "hi\n"}}, ""}, + {"single string 'hi\\''", "'hi\\''", tokens{{kind: tokenStringSingle, data: "hi\\'"}}, ""}, + {"single string 'hi\\nl'", "'hi\\\n'", tokens{{kind: tokenStringSingle, data: "hi\\\n"}}, ""}, + {"single string 'hi", "'hi", tokens{}, "single string 'hi:1:1 Unterminated String"}, + + {"assert", "assert", tokens{{kind: tokenAssert, data: "assert"}}, ""}, + {"else", "else", tokens{{kind: tokenElse, data: "else"}}, ""}, + {"error", "error", tokens{{kind: tokenError, data: "error"}}, ""}, + {"false", "false", tokens{{kind: tokenFalse, data: "false"}}, ""}, + {"for", "for", tokens{{kind: tokenFor, data: "for"}}, ""}, + {"function", "function", tokens{{kind: tokenFunction, data: "function"}}, ""}, + {"if", "if", tokens{{kind: tokenIf, data: "if"}}, ""}, + {"import", "import", tokens{{kind: tokenImport, data: "import"}}, ""}, + {"importstr", "importstr", tokens{{kind: tokenImportStr, data: "importstr"}}, ""}, + {"in", "in", tokens{{kind: tokenIn, data: "in"}}, ""}, + {"local", "local", tokens{{kind: tokenLocal, data: "local"}}, ""}, + {"null", "null", tokens{{kind: tokenNullLit, data: "null"}}, ""}, + {"self", "self", tokens{{kind: tokenSelf, data: "self"}}, ""}, + {"super", "super", tokens{{kind: tokenSuper, data: "super"}}, ""}, + {"tailstrict", "tailstrict", tokens{{kind: tokenTailStrict, data: "tailstrict"}}, ""}, + {"then", "then", tokens{{kind: tokenThen, data: "then"}}, ""}, + {"true", "true", tokens{{kind: tokenTrue, data: "true"}}, ""}, + + {"identifier", "foobar", tokens{{kind: tokenIdentifier, data: "foobar"}}, ""}, + + {"c++ comment", "// hi", tokens{}, ""}, // This test doesn't look at fodder (yet?) + {"hash comment", "# hi", tokens{}, ""}, // This test doesn't look at fodder (yet?) 
+ {"c comment", "/* hi */", tokens{}, ""}, // This test doesn't look at fodder (yet?) + + { + "block string spaces", + `||| + test + more + ||| + foo +|||`, + tokens{ + { + kind: tokenStringBlock, + data: "test\n more\n|||\n foo\n", + stringBlockIndent: " ", + stringBlockTermIndent: "", + }, + }, + "", + }, + { + "block string tabs", + `||| + test + more + ||| + foo +|||`, + tokens{ + { + kind: tokenStringBlock, + data: "test\n more\n|||\n foo\n", + stringBlockIndent: "\t", + stringBlockTermIndent: "", + }, + }, + "", + }, + { + "block string mixed", + `||| + test + more + ||| + foo +|||`, + tokens{ + { + kind: tokenStringBlock, + data: "test\n more\n|||\n foo\n", + stringBlockIndent: "\t \t", + stringBlockTermIndent: "", + }, + }, + "", + }, + { + "block string blanks", + `||| + + test + + + more + ||| + foo +|||`, + tokens{ + { + kind: tokenStringBlock, + data: "\ntest\n\n\n more\n|||\n foo\n", + stringBlockIndent: " ", + stringBlockTermIndent: "", + }, + }, + "", + }, + { + "block string bad indent", + `||| + test + foo +|||`, + tokens{}, + "block string bad indent:1:1 Text block not terminated with |||", + }, + { + "block string eof", + `||| + test`, + tokens{}, + "block string eof:1:1 Unexpected EOF", + }, + { + "block string not term", + `||| + test +`, + tokens{}, + "block string not term:1:1 Text block not terminated with |||", + }, + + {"op *", "*", tokens{{kind: tokenOperator, data: "*"}}, ""}, + {"op /", "/", tokens{{kind: tokenOperator, data: "/"}}, ""}, + {"op %", "%", tokens{{kind: tokenOperator, data: "%"}}, ""}, + {"op &", "&", tokens{{kind: tokenOperator, data: "&"}}, ""}, + {"op |", "|", tokens{{kind: tokenOperator, data: "|"}}, ""}, + {"op ^", "^", tokens{{kind: tokenOperator, data: "^"}}, ""}, + {"op =", "=", tokens{{kind: tokenOperator, data: "="}}, ""}, + {"op <", "<", tokens{{kind: tokenOperator, data: "<"}}, ""}, + {"op >", ">", tokens{{kind: tokenOperator, data: ">"}}, ""}, + {"op >==|", ">==|", tokens{{kind: tokenOperator, data: ">==|"}}, 
""},
+
+	{"junk", "💩", tokens{}, "junk:1:1 Could not lex the character '\\U0001f4a9'"},
+}
+
+// tokensEqual reports whether ts1 and ts2 have the same length and agree
+// elementwise on kind, data, stringBlockIndent and stringBlockTermIndent.
+// Other token fields (e.g. fodder, positions) are intentionally ignored,
+// matching what the table-driven tests above can express.
+func tokensEqual(ts1, ts2 tokens) bool {
+	if len(ts1) != len(ts2) {
+		return false
+	}
+	for i := range ts1 {
+		t1, t2 := ts1[i], ts2[i]
+		if t1.kind != t2.kind {
+			return false
+		}
+		if t1.data != t2.data {
+			return false
+		}
+		if t1.stringBlockIndent != t2.stringBlockIndent {
+			return false
+		}
+		if t1.stringBlockTermIndent != t2.stringBlockTermIndent {
+			return false
+		}
+	}
+	return true
+}
+
+// TestLex runs every lexTests entry through lex and checks the returned
+// error string against test.errString; on success it also compares the
+// token stream (which must end with an EOF token) against test.tokens.
+func TestLex(t *testing.T) {
+	for _, test := range lexTests {
+		// Copy the test tokens and append an EOF token
+		testTokens := append(tokens(nil), test.tokens...)
+		testTokens = append(testTokens, tEOF)
+		tokens, err := lex(test.name, test.input)
+		// An empty errString means the lex is expected to succeed.
+		var errString string
+		if err != nil {
+			errString = err.Error()
+		}
+		if errString != test.errString {
+			t.Errorf("%s: error result does not match. got\n\t%+v\nexpected\n\t%+v",
+				test.name, errString, test.errString)
+		}
+		if err == nil && !tokensEqual(tokens, testTokens) {
+			t.Errorf("%s: got\n\t%+v\nexpected\n\t%+v", test.name, tokens, testTokens)
+		}
+	}
+}
+
+// TODO: test fodder, test position reporting
diff --git a/static_error.go b/static_error.go
new file mode 100644
index 0000000..8bd0c3e
--- /dev/null
+++ b/static_error.go
@@ -0,0 +1,95 @@
+package jsonnet
+
+import (
+	"fmt"
+)
+
+//////////////////////////////////////////////////////////////////////////////
+// Location
+
+// Location represents a single location in an (unspecified) file.
+// The zero value (Line == 0) means "not set"; see IsSet.
+type Location struct {
+	Line   int
+	Column int
+}
+
+// IsSet returns if this Location has been set.
+func (l *Location) IsSet() bool {
+	return l.Line != 0
+}
+
+// String renders the location as "line:column".
+func (l *Location) String() string {
+	return fmt.Sprintf("%v:%v", l.Line, l.Column)
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// LocationRange
+
+// LocationRange represents a range of a source file.
+type LocationRange struct {
+	FileName string
+	Begin    Location
+	End      Location
+}
+
+// IsSet returns if this LocationRange has been set.
+// A range counts as set when its Begin location is set.
+func (lr *LocationRange) IsSet() bool {
+	return lr.Begin.IsSet()
+}
+
+// String renders the range as compactly as possible:
+// "file:l:c" for a point, "file:l:c-c" when only the column differs,
+// and "file:(l:c)-(l:c)" when the range spans lines. An unset range
+// renders as just the FileName (which may carry a message; see
+// makeLocationRangeMessage).
+func (lr *LocationRange) String() string {
+	if !lr.IsSet() {
+		return lr.FileName
+	}
+
+	var filePrefix string
+	if len(lr.FileName) > 0 {
+		filePrefix = lr.FileName + ":"
+	}
+	if lr.Begin.Line == lr.End.Line {
+		if lr.Begin.Column == lr.End.Column {
+			return fmt.Sprintf("%s%v", filePrefix, lr.Begin.String())
+		}
+		return fmt.Sprintf("%s%v-%v", filePrefix, lr.Begin.String(), lr.End.Column)
+	}
+
+	return fmt.Sprintf("%s(%v)-(%v)", filePrefix, lr.Begin.String(), lr.End.String())
+}
+
+// This is useful for special locations, e.g. manifestation entry point.
+// The msg is stored in FileName so an unset range stringifies to it.
+func makeLocationRangeMessage(msg string) LocationRange {
+	return LocationRange{FileName: msg}
+}
+
+// makeLocationRange builds a LocationRange for file fn spanning begin..end.
+func makeLocationRange(fn string, begin Location, end Location) LocationRange {
+	return LocationRange{FileName: fn, Begin: begin, End: end}
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// StaticError
+
+// StaticError represents an error during parsing/lexing some jsonnet.
+type StaticError struct {
+	Loc LocationRange
+	Msg string
+}
+
+// makeStaticErrorMsg builds a StaticError with no location information.
+func makeStaticErrorMsg(msg string) StaticError {
+	return StaticError{Msg: msg}
+}
+
+// makeStaticErrorPoint builds a StaticError at a single point l in file fn
+// (Begin == End).
+func makeStaticErrorPoint(msg string, fn string, l Location) StaticError {
+	return StaticError{Msg: msg, Loc: makeLocationRange(fn, l, l)}
+}
+
+// makeStaticError builds a StaticError over the full range lr.
+func makeStaticError(msg string, lr LocationRange) StaticError {
+	return StaticError{Msg: msg, Loc: lr}
+}
+
+// Error implements the error interface as "<loc> <msg>".
+// NOTE(review): when Loc is unset, loc stays "" and the result carries a
+// leading space ( " msg" ) — confirm callers (e.g. makeStaticErrorMsg
+// users) expect that.
+func (err StaticError) Error() string {
+	loc := ""
+	if err.Loc.IsSet() {
+		loc = err.Loc.String()
+	}
+	return fmt.Sprintf("%v %v", loc, err.Msg)
+}