From a94bfef7648e3d3865afc7febda171269bd11e35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Barzowski?= Date: Thu, 10 Aug 2017 17:26:37 -0400 Subject: [PATCH] Add verbatim string support --- ast.go | 2 ++ desugarer.go | 15 ++++++++------ lexer.go | 55 +++++++++++++++++++++++++++++++++++++++++++++------ lexer_test.go | 10 ++++++++++ main_test.go | 1 + parser.go | 14 +++++++++++++ 6 files changed, 85 insertions(+), 12 deletions(-) diff --git a/ast.go b/ast.go index 79bc96f..5bad7c7 100644 --- a/ast.go +++ b/ast.go @@ -348,6 +348,8 @@ const ( astStringSingle astLiteralStringKind = iota astStringDouble astStringBlock + astVerbatimStringDouble + astVerbatimStringSingle ) // astLiteralString represents a JSON string diff --git a/desugarer.go b/desugarer.go index 34933f6..c299d8d 100644 --- a/desugarer.go +++ b/desugarer.go @@ -324,13 +324,16 @@ func desugar(astPtr *astNode, objLevel int) (err error) { // Nothing to do. case *astLiteralString: - unescaped, err := stringUnescape(ast.Loc(), ast.value) - if err != nil { - return err + if ast.kind != astVerbatimStringDouble && ast.kind != astVerbatimStringSingle { + unescaped, err := stringUnescape(ast.Loc(), ast.value) + if err != nil { + return err + } + // TODO(sbarzowski) perhaps store unescaped in a separate field... + ast.value = unescaped + ast.kind = astStringDouble + ast.blockIndent = "" } - ast.value = unescaped - ast.kind = astStringDouble - ast.blockIndent = "" case *astObject: // Hidden variable to allow $ binding. 
diff --git a/lexer.go b/lexer.go index 77857ee..2f98383 100644 --- a/lexer.go +++ b/lexer.go @@ -70,6 +70,8 @@ const ( tokenStringBlock tokenStringDouble tokenStringSingle + tokenVerbatimStringDouble + tokenVerbatimStringSingle // Keywords tokenAssert @@ -109,12 +111,14 @@ var tokenKindStrings = []string{ tokenSemicolon: "\";\"", // Arbitrary length lexemes - tokenIdentifier: "IDENTIFIER", - tokenNumber: "NUMBER", - tokenOperator: "OPERATOR", - tokenStringBlock: "STRING_BLOCK", - tokenStringDouble: "STRING_DOUBLE", - tokenStringSingle: "STRING_SINGLE", + tokenIdentifier: "IDENTIFIER", + tokenNumber: "NUMBER", + tokenOperator: "OPERATOR", + tokenStringBlock: "STRING_BLOCK", + tokenStringDouble: "STRING_DOUBLE", + tokenStringSingle: "STRING_SINGLE", + tokenVerbatimStringDouble: "VERBATIM_STRING_DOUBLE", + tokenVerbatimStringSingle: "VERBATIM_STRING_SINGLE", // Keywords tokenAssert: "assert", @@ -735,6 +739,45 @@ func lex(fn string, input string) (tokens, error) { r = l.next() } } + case '@': + // Verbatim string literals. + // ' and " quoting is interpreted here, unlike non-verbatim strings + // where it is done later by jsonnet_string_unescape. This is OK + // in this case because no information is lost by resolving the + // repeated quote into a single quote, so we can go back to the + // original form in the formatter. 
+ var data []rune + stringStartLoc := l.prevLocation() + quot := l.next() + var kind tokenKind + if quot == '"' { + kind = tokenVerbatimStringDouble + } else if quot == '\'' { + kind = tokenVerbatimStringSingle + } else { + return nil, makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex verbatim string, junk after '@': %v", quot), + l.fileName, + stringStartLoc, + ) + } + for r = l.next(); ; r = l.next() { + if r == lexEOF { + return nil, makeStaticErrorPoint("Unterminated String", l.fileName, stringStartLoc) + } else if r == quot { + if l.peek() == quot { + l.next() + data = append(data, r) + } else { + l.emitFullToken(kind, string(data), "", "") + l.resetTokenStart() + break + } + } else { + data = append(data, r) + } + } + case '#': l.resetTokenStart() // Throw out the leading # for r = l.next(); r != lexEOF && r != '\n'; r = l.next() { diff --git a/lexer_test.go b/lexer_test.go index 91e880b..c8ea31f 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -226,6 +226,16 @@ test "block string no ws:1:1 Text block's first line must start with whitespace", }, + {"verbatim_string1", `@""`, tokens{{kind: tokenVerbatimStringDouble, data: ""}}, ""}, + {"verbatim_string2", `@''`, tokens{{kind: tokenVerbatimStringSingle, data: ""}}, ""}, + {"verbatim_string3", `@""""`, tokens{{kind: tokenVerbatimStringDouble, data: `"`}}, ""}, + {"verbatim_string4", `@''''`, tokens{{kind: tokenVerbatimStringSingle, data: "'"}}, ""}, + {"verbatim_string5", `@"\n"`, tokens{{kind: tokenVerbatimStringDouble, data: "\\n"}}, ""}, + {"verbatim_string6", `@"''"`, tokens{{kind: tokenVerbatimStringDouble, data: "''"}}, ""}, + + {"verbatim_string_unterminated", `@"blah blah`, tokens{}, "verbatim_string_unterminated:1:1 Unterminated String"}, + {"verbatim_string_junk", `@blah blah`, tokens{}, "verbatim_string_junk:1:1 Couldn't lex verbatim string, junk after '@': 98"}, + {"op *", "*", tokens{{kind: tokenOperator, data: "*"}}, ""}, {"op /", "/", tokens{{kind: tokenOperator, data: "/"}}, ""}, {"op %", "%", 
tokens{{kind: tokenOperator, data: "%"}}, ""}, diff --git a/main_test.go b/main_test.go index 8b77ade..a67cb7e 100644 --- a/main_test.go +++ b/main_test.go @@ -40,6 +40,7 @@ var mainTests = []mainTest{ {"simple_arith_string2", "\"aaa\" + \"\"", "\"aaa\"", ""}, {"simple_arith_string3", "\"\" + \"bbb\"", "\"bbb\"", ""}, {"simple_arith_string_empty", "\"\" + \"\"", "\"\"", ""}, + {"verbatim_string", `@"blah ☺"`, `"blah ☺"`, ""}, {"empty_array", "[]", "[ ]", ""}, {"array", "[1, 2, 1 + 2]", "[\n 1,\n 2,\n 3\n]", ""}, {"empty_object", "{}", "{ }", ""}, diff --git a/parser.go b/parser.go index 7a7ed12..302554b 100644 --- a/parser.go +++ b/parser.go @@ -346,6 +346,8 @@ func (p *parser) parseObjectRemainder(tok *token) (astNode, *token, error) { kind: astStringBlock, blockIndent: next.stringBlockIndent, } + // TODO(sbarzowski) are verbatim string literals allowed here? + // if so, maybe it's time we extracted string literal creation somewhere... default: kind = astObjectFieldExpr var err error @@ -658,6 +660,18 @@ func (p *parser) parseTerminal() (astNode, error) { kind: astStringDouble, blockIndent: tok.stringBlockIndent, }, nil + case tokenVerbatimStringDouble: + return &astLiteralString{ + astNodeBase: astNodeBase{loc: tok.loc}, + value: tok.data, + kind: astVerbatimStringDouble, + }, nil + case tokenVerbatimStringSingle: + return &astLiteralString{ + astNodeBase: astNodeBase{loc: tok.loc}, + value: tok.data, + kind: astVerbatimStringSingle, + }, nil case tokenFalse: return &astLiteralBoolean{ astNodeBase: astNodeBase{loc: tok.loc},