From a94bfef7648e3d3865afc7febda171269bd11e35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Barzowski?= <sbarzowski@google.com>
Date: Thu, 10 Aug 2017 17:26:37 -0400
Subject: [PATCH] Add verbatim string support

---
 ast.go        |  2 ++
 desugarer.go  | 15 ++++++++------
 lexer.go      | 55 +++++++++++++++++++++++++++++++++++++++++++++------
 lexer_test.go | 10 ++++++++++
 main_test.go  |  1 +
 parser.go     | 14 +++++++++++++
 6 files changed, 85 insertions(+), 12 deletions(-)

diff --git a/ast.go b/ast.go
index 79bc96f..5bad7c7 100644
--- a/ast.go
+++ b/ast.go
@@ -348,6 +348,8 @@ const (
 	astStringSingle astLiteralStringKind = iota
 	astStringDouble
 	astStringBlock
+	astVerbatimStringDouble
+	astVerbatimStringSingle
 )
 
 // astLiteralString represents a JSON string
diff --git a/desugarer.go b/desugarer.go
index 34933f6..c299d8d 100644
--- a/desugarer.go
+++ b/desugarer.go
@@ -324,13 +324,16 @@ func desugar(astPtr *astNode, objLevel int) (err error) {
 		// Nothing to do.
 
 	case *astLiteralString:
-		unescaped, err := stringUnescape(ast.Loc(), ast.value)
-		if err != nil {
-			return err
+		if ast.kind != astVerbatimStringDouble && ast.kind != astVerbatimStringSingle {
+			unescaped, err := stringUnescape(ast.Loc(), ast.value)
+			if err != nil {
+				return err
+			}
+			// TODO(sbarzowski) perhaps store unescaped in a separate field...
+			ast.value = unescaped
+			ast.kind = astStringDouble
+			ast.blockIndent = ""
 		}
-		ast.value = unescaped
-		ast.kind = astStringDouble
-		ast.blockIndent = ""
 
 	case *astObject:
 		// Hidden variable to allow $ binding.
diff --git a/lexer.go b/lexer.go
index 77857ee..2f98383 100644
--- a/lexer.go
+++ b/lexer.go
@@ -70,6 +70,8 @@ const (
 	tokenStringBlock
 	tokenStringDouble
 	tokenStringSingle
+	tokenVerbatimStringDouble
+	tokenVerbatimStringSingle
 
 	// Keywords
 	tokenAssert
@@ -109,12 +111,14 @@ var tokenKindStrings = []string{
 	tokenSemicolon: "\";\"",
 
 	// Arbitrary length lexemes
-	tokenIdentifier:   "IDENTIFIER",
-	tokenNumber:       "NUMBER",
-	tokenOperator:     "OPERATOR",
-	tokenStringBlock:  "STRING_BLOCK",
-	tokenStringDouble: "STRING_DOUBLE",
-	tokenStringSingle: "STRING_SINGLE",
+	tokenIdentifier:           "IDENTIFIER",
+	tokenNumber:               "NUMBER",
+	tokenOperator:             "OPERATOR",
+	tokenStringBlock:          "STRING_BLOCK",
+	tokenStringDouble:         "STRING_DOUBLE",
+	tokenStringSingle:         "STRING_SINGLE",
+	tokenVerbatimStringDouble: "VERBATIM_STRING_DOUBLE",
+	tokenVerbatimStringSingle: "VERBATIM_STRING_SINGLE",
 
 	// Keywords
 	tokenAssert:     "assert",
@@ -735,6 +739,45 @@ func lex(fn string, input string) (tokens, error) {
 					r = l.next()
 				}
 			}
+		case '@':
+			// Verbatim string literals.
+			// ' and " quoting is interpreted here, unlike non-verbatim strings
+			// where it is done later by stringUnescape (in desugar).  This is OK
+			// in this case because no information is lost by resolving the
+			// repeated quote into a single quote, so we can go back to the
+			// original form in the formatter.
+			var data []rune
+			stringStartLoc := l.prevLocation()
+			quot := l.next()
+			var kind tokenKind
+			if quot == '"' {
+				kind = tokenVerbatimStringDouble
+			} else if quot == '\'' {
+				kind = tokenVerbatimStringSingle
+			} else {
+				return nil, makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex verbatim string, junk after '@': %v", quot),
+					l.fileName,
+					stringStartLoc,
+				)
+			}
+			for r = l.next(); ; r = l.next() {
+				if r == lexEOF {
+					return nil, makeStaticErrorPoint("Unterminated String", l.fileName, stringStartLoc)
+				} else if r == quot {
+					if l.peek() == quot {
+						l.next()
+						data = append(data, r)
+					} else {
+						l.emitFullToken(kind, string(data), "", "")
+						l.resetTokenStart()
+						break
+					}
+				} else {
+					data = append(data, r)
+				}
+			}
+
 		case '#':
 			l.resetTokenStart() // Throw out the leading #
 			for r = l.next(); r != lexEOF && r != '\n'; r = l.next() {
diff --git a/lexer_test.go b/lexer_test.go
index 91e880b..c8ea31f 100644
--- a/lexer_test.go
+++ b/lexer_test.go
@@ -226,6 +226,16 @@ test
 		"block string no ws:1:1 Text block's first line must start with whitespace",
 	},
 
+	{"verbatim_string1", `@""`, tokens{{kind: tokenVerbatimStringDouble, data: ""}}, ""},
+	{"verbatim_string2", `@''`, tokens{{kind: tokenVerbatimStringSingle, data: ""}}, ""},
+	{"verbatim_string3", `@""""`, tokens{{kind: tokenVerbatimStringDouble, data: `"`}}, ""},
+	{"verbatim_string4", `@''''`, tokens{{kind: tokenVerbatimStringSingle, data: "'"}}, ""},
+	{"verbatim_string5", `@"\n"`, tokens{{kind: tokenVerbatimStringDouble, data: "\\n"}}, ""},
+	{"verbatim_string6", `@"''"`, tokens{{kind: tokenVerbatimStringDouble, data: "''"}}, ""},
+
+	{"verbatim_string_unterminated", `@"blah blah`, tokens{}, "verbatim_string_unterminated:1:1 Unterminated String"},
+	{"verbatim_string_junk", `@blah blah`, tokens{}, "verbatim_string_junk:1:1 Couldn't lex verbatim string, junk after '@': 98"},
+
 	{"op *", "*", tokens{{kind: tokenOperator, data: "*"}}, ""},
 	{"op /", "/", tokens{{kind: tokenOperator, data: "/"}}, ""},
 	{"op %", "%", tokens{{kind: tokenOperator, data: "%"}}, ""},
diff --git a/main_test.go b/main_test.go
index 8b77ade..a67cb7e 100644
--- a/main_test.go
+++ b/main_test.go
@@ -40,6 +40,7 @@ var mainTests = []mainTest{
 	{"simple_arith_string2", "\"aaa\" + \"\"", "\"aaa\"", ""},
 	{"simple_arith_string3", "\"\" + \"bbb\"", "\"bbb\"", ""},
 	{"simple_arith_string_empty", "\"\" + \"\"", "\"\"", ""},
+	{"verbatim_string", `@"blah ☺"`, `"blah ☺"`, ""},
 	{"empty_array", "[]", "[ ]", ""},
 	{"array", "[1, 2, 1 + 2]", "[\n   1,\n   2,\n   3\n]", ""},
 	{"empty_object", "{}", "{ }", ""},
diff --git a/parser.go b/parser.go
index 7a7ed12..302554b 100644
--- a/parser.go
+++ b/parser.go
@@ -346,6 +346,8 @@ func (p *parser) parseObjectRemainder(tok *token) (astNode, *token, error) {
 					kind:        astStringBlock,
 					blockIndent: next.stringBlockIndent,
 				}
+			// TODO(sbarzowski) are verbatim string literals allowed here?
+			// if so, maybe it's time we extracted string literal creation somewhere...
 			default:
 				kind = astObjectFieldExpr
 				var err error
@@ -658,6 +660,18 @@ func (p *parser) parseTerminal() (astNode, error) {
 			kind:        astStringDouble,
 			blockIndent: tok.stringBlockIndent,
 		}, nil
+	case tokenVerbatimStringDouble:
+		return &astLiteralString{
+			astNodeBase: astNodeBase{loc: tok.loc},
+			value:       tok.data,
+			kind:        astVerbatimStringDouble,
+		}, nil
+	case tokenVerbatimStringSingle:
+		return &astLiteralString{
+			astNodeBase: astNodeBase{loc: tok.loc},
+			value:       tok.data,
+			kind:        astVerbatimStringSingle,
+		}, nil
 	case tokenFalse:
 		return &astLiteralBoolean{
 			astNodeBase: astNodeBase{loc: tok.loc},