parse string later

aibasel · Nov 20, 2023 · c9cfb16
1 parent d751d5e
commit c9cfb16
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 83 deletions.
diff --git a/src/search/parser/abstract_syntax_tree.cc b/src/search/parser/abstract_syntax_tree.cc
@@ -408,21 +408,13 @@ LiteralNode::LiteralNode(Token value)
 }
 
 DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const {
-    utils::TraceBlock block(context, "Checking Literal: " + value.repr());
+    utils::TraceBlock block(context, "Checking Literal: " + value.content);
     if (context.has_variable(value.content)) {
-        if (value.type == TokenType::IDENTIFIER) {
-            string variable_name = value.content;
-            return utils::make_unique_ptr<VariableNode>(variable_name);
-        } else if (value.type != TokenType::STRING) {
-            /*
-              Variable names may be identical to a string literal but not
-              identical to any other token, e.g., a boolean:
-                  "let(true, blind(), astar(true))"
-              This kind of mistake is handled earlier, so ending up here is a
-              programming mistake, not an input error.
-            */
+        if (value.type != TokenType::IDENTIFIER) {
             ABORT("A non-identifier token was defined as variable.");
         }
+        string variable_name = value.content;
+        return utils::make_unique_ptr<VariableNode>(variable_name);
     }
 
     switch (value.type) {
@@ -444,7 +436,7 @@ DecoratedASTNodePtr LiteralNode::decorate(DecorateContext &context) const {
 
 void LiteralNode::dump(string indent) const {
     cout << indent << token_type_name(value.type) << ": "
-         << value.repr() << endl;
+         << value.content << endl;
 }
 
 const plugins::Type &LiteralNode::get_type(DecorateContext &context) const {

diff --git a/src/search/parser/decorated_abstract_syntax_tree.cc b/src/search/parser/decorated_abstract_syntax_tree.cc
@@ -224,11 +224,38 @@ StringLiteralNode::StringLiteralNode(const string &value)
 
 plugins::Any StringLiteralNode::construct(ConstructContext &context) const {
     utils::TraceBlock block(context, "Constructing string value from '" + value + "'");
-    return value;
+    if (value.size() < 2 || !(value.starts_with('"') && value.ends_with('"'))) {
+        ABORT("String literal value is not enclosed in quotation marks"
+              " (this should have been caught before constructing this node).");
+    }
+    /*
+      The size check above rejects a lone '"', for which length() - 2 would wrap
+      around. Beyond that, no syntax checking is done. Escaped symbols other than
+      \n just drop the escaping \ (e.g., \t is treated as t, not a tab). Strings
+      ending in \ produce no error but should be excluded by the previous steps.
+    */
+    string result;
+    result.reserve(value.length() - 2);
+    bool escaped = false;
+    for (char c : value.substr(1, value.size() - 2)) {
+        if (escaped) {
+            escaped = false;
+            if (c == 'n') {
+                result += '\n';
+            } else {
+                result += c;
+            }
+        } else if (c == '\\') {
+            escaped = true;
+        } else {
+            result += c;
+        }
+    }
+    return result;
 }
 
 void StringLiteralNode::dump(string indent) const {
-    cout << indent << "STRING: \"" << utils::escape(value) << "\"" << endl;
+    cout << indent << "STRING: " << value << endl;
 }
 
 IntLiteralNode::IntLiteralNode(const string &value)

diff --git a/src/search/parser/lexical_analyzer.cc b/src/search/parser/lexical_analyzer.cc
@@ -29,7 +29,7 @@ static vector<pair<TokenType, regex>> construct_token_type_expressions() {
         {TokenType::INTEGER,
          R"([+-]?(infinity|\d+([kmg]\b)?))"},
         {TokenType::BOOLEAN, R"(true|false)"},
-        {TokenType::STRING, R"(\"((\\\\|\\"|\\n|[^"\\])*)\")"},
+        {TokenType::STRING, R"("(\\\\|\\"|\\n|[^"\\])*")"},
         {TokenType::LET, R"(let)"},
         {TokenType::IDENTIFIER, R"([a-zA-Z_]\w*)"}
     };
@@ -77,13 +77,7 @@ TokenStream split_tokens(const string &text) {
             TokenType token_type = type_and_expression.first;
             const regex &expression = type_and_expression.second;
             if (regex_search(start, end, match, expression, regex_constants::match_continuous)) {
-                string value;
-                if (token_type == TokenType::STRING) {
-                    value = utils::unescape(match[2]);
-                } else {
-                    value = utils::tolower(match[1]);
-                }
-                tokens.push_back({value, token_type});
+                tokens.push_back({match[1], token_type});
                 start += match[0].length();
                 has_match = true;
                 break;

diff --git a/src/search/parser/token_stream.cc b/src/search/parser/token_stream.cc
@@ -11,17 +11,19 @@
 using namespace std;
 
 namespace parser {
-Token::Token(const string &content, TokenType type)
-    : content(content), type(type) {
-}
-
-string Token::repr() const {
-    if (type == TokenType::STRING) {
-        return "\"" + utils::escape(content) + "\"";
+static string case_insensitive_to_lower(const string &content, TokenType type) {
+    if (type == TokenType::BOOLEAN ||
+        type == TokenType::FLOAT ||
+        type == TokenType::IDENTIFIER ||
+        type == TokenType::INTEGER) {
+        return utils::tolower(content);
     } else {
         return content;
     }
 }
+Token::Token(const string &content, TokenType type)
+    : content(case_insensitive_to_lower(content, type)), type(type) {
+}
 
 TokenStream::TokenStream(vector<Token> &&tokens)
     : tokens(move(tokens)), pos(0) {
@@ -78,7 +80,7 @@ string TokenStream::str(int from, int to) const {
     int max_position = min(static_cast<int>(tokens.size()), to);
     ostringstream message;
     while (curr_position < max_position) {
-        message << tokens[curr_position].repr();
+        message << tokens[curr_position].content;
         curr_position++;
     }
     return message.str();
@@ -121,7 +123,7 @@ ostream &operator<<(ostream &out, TokenType token_type) {
 }
 
 ostream &operator<<(ostream &out, const Token &token) {
-    out << "<Type: '" << token.type << "', Value: '" << token.repr() << "'>";
+    out << "<Type: '" << token.type << "', Value: '" << token.content << "'>";
     return out;
 }
 }
diff --git a/src/search/parser/token_stream.h b/src/search/parser/token_stream.h
@@ -29,7 +29,6 @@ struct Token {
     TokenType type;
 
     Token(const std::string &content, TokenType type);
-    std::string repr() const;
 };
 
 class TokenStream {

diff --git a/src/search/utils/strings.cc b/src/search/utils/strings.cc
@@ -8,54 +8,6 @@
 using namespace std;
 
 namespace utils {
-string escape(const string &s) {
-    /*
-      Escape any occurrences of \ with \\, occurrences of " with \" and
-      newlines with \n.
-    */
-    string result;
-    result.reserve(s.length());
-    for (char c : s) {
-        if (c == '\\') {
-            result += "\\\\";
-        } else if (c == '"') {
-            result += "\\\"";
-        } else if (c == '\n') {
-            result += "\\n";
-        } else {
-            result += c;
-        }
-    }
-    return result;
-}
-
-string unescape(const string &s) {
-    /*
-      On sequences created with escape(), this will restore the original string.
-      However, no syntax checking is done. Escaped symbols other than the ones
-      added by escape() will just ignore the escaping \ (e.g., \t is treated
-      as t, not as a tab). Strings ending in \ will not produce an error.
-    */
-    string result;
-    result.reserve(s.length());
-    bool escaped = false;
-    for (char c : s) {
-        if (escaped) {
-            escaped = false;
-            if (c == 'n') {
-                result += '\n';
-            } else {
-                result += c;
-            }
-        } else if (c == '\\') {
-            escaped = true;
-        } else {
-            result += c;
-        }
-    }
-    return result;
-}
-
 void lstrip(string &s) {
     s.erase(s.begin(), find_if(s.begin(), s.end(), [](int ch) {
                                    return !isspace(ch);

diff --git a/src/search/utils/strings.h b/src/search/utils/strings.h
@@ -8,8 +8,6 @@
 #include <vector>
 
 namespace utils {
-extern std::string escape(const std::string &s);
-extern std::string unescape(const std::string &s);
 extern void lstrip(std::string &s);
 extern void rstrip(std::string &s);
 extern void strip(std::string &s);