From 1717f544b8aaadd3d72747fcd4172e789b0a965b Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Fri, 13 Dec 2024 16:08:26 +0000
Subject: [PATCH 1/9] Add utilities for escaping search queries and fix kql
 escaping behaviour

---
 components/core/src/clp_s/Utils.cpp           | 245 +++++++++++++++++-
 components/core/src/clp_s/Utils.hpp           |  75 +++++-
 components/core/src/clp_s/clp-s.cpp           |   2 +-
 .../clp_s/search/AddTimestampConditions.cpp   |   3 +-
 .../src/clp_s/search/ColumnDescriptor.cpp     |  22 +-
 .../src/clp_s/search/ColumnDescriptor.hpp     | 111 +++++---
 .../core/src/clp_s/search/SchemaMatch.cpp     |   9 +-
 components/core/src/clp_s/search/kql/Kql.g4   |  15 +-
 components/core/src/clp_s/search/kql/kql.cpp  |  23 +-
 components/core/tests/test-kql.cpp            |  18 +-
 10 files changed, 437 insertions(+), 86 deletions(-)

diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index acee48851..976c3ee14 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -1,6 +1,7 @@
 #include "Utils.hpp"
 
 #include <boost/filesystem.hpp>
+#include <simdjson.h>
 #include <spdlog/spdlog.h>
 
 using std::string;
@@ -431,30 +432,246 @@ bool StringUtils::tokenize_column_descriptor(
         std::string const& descriptor,
         std::vector<std::string>& tokens
 ) {
-    // TODO: add support for unicode sequences e.g. \u263A
     std::string cur_tok;
-    for (size_t cur = 0; cur < descriptor.size(); ++cur) {
-        if ('\\' == descriptor[cur]) {
-            ++cur;
-            if (cur >= descriptor.size()) {
-                return false;
-            }
-        } else if ('.' == descriptor[cur]) {
-            if (cur_tok.empty()) {
-                return false;
+    bool escaped{false};
+    for (size_t i = 0; i < descriptor.size(); ++i) {
+        if (false == escaped) {
+            if ('\\' == descriptor[i]) {
+                escaped = true;
+            } else if ('.' == descriptor[i]) {
+                if (cur_tok.empty()) {
+                    return false;
+                }
+                std::string unescaped_token;
+                if (unescape_kql_internal(cur_tok, unescaped_token, false)) {
+                    tokens.push_back(unescaped_token);
+                    cur_tok.clear();
+                } else {
+                    return false;
+                }
+            } else {
+                cur_tok.push_back(descriptor[i]);
             }
-            tokens.push_back(cur_tok);
-            cur_tok.clear();
             continue;
         }
-        cur_tok.push_back(descriptor[cur]);
+
+        escaped = false;
+        switch (descriptor[i]) {
+            case '.':
+                cur_tok.push_back('.');
+                break;
+            default:
+                cur_tok.push_back('\\');
+                cur_tok.push_back(descriptor[i]);
+                break;
+        }
+    }
+
+    if (escaped) {
+        return false;
     }
 
     if (cur_tok.empty()) {
         return false;
     }
 
-    tokens.push_back(cur_tok);
+    std::string unescaped_token;
+    if (unescape_kql_internal(cur_tok, unescaped_token, false)) {
+        tokens.push_back(unescaped_token);
+    } else {
+        return false;
+    }
+    return true;
+}
+
+void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
+    // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
+    // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers
+    // a net decompression speedup of ~30% compared to adding every character to the destination one
+    // character at a time.
+    size_t slice_begin{0ULL};
+    auto append_unescaped_slice = [&](size_t i) {
+        if (slice_begin < i) {
+            destination.append(source.substr(slice_begin, i - slice_begin));
+        }
+        slice_begin = i + 1;
+    };
+    for (size_t i = 0; i < source.size(); ++i) {
+        char c = source[i];
+        switch (c) {
+            case '"':
+                append_unescaped_slice(i);
+                destination.append("\\\"");
+                break;
+            case '\\':
+                append_unescaped_slice(i);
+                destination.append("\\\\");
+                break;
+            case '\t':
+                append_unescaped_slice(i);
+                destination.append("\\t");
+                break;
+            case '\r':
+                append_unescaped_slice(i);
+                destination.append("\\r");
+                break;
+            case '\n':
+                append_unescaped_slice(i);
+                destination.append("\\n");
+                break;
+            case '\b':
+                append_unescaped_slice(i);
+                destination.append("\\b");
+                break;
+            case '\f':
+                append_unescaped_slice(i);
+                destination.append("\\f");
+                break;
+            default:
+                if ('\x00' <= c && c <= '\x1f') {
+                    append_unescaped_slice(i);
+                    char_to_escaped_four_char_hex(destination, c);
+                }
+                break;
+        }
+    }
+    append_unescaped_slice(source.size());
+}
+
+namespace {
+/**
+ * Convert a four byte hex sequence to utf8. We perform the conversion in this cumbersome way
+ * because c++20 deprecates most of the much more convenient std::codecvt utilities.
+ */
+bool convert_four_byte_hex_to_utf8(std::string_view const hex, std::string& destination) {
+    std::string buf = "\"\\u";
+    buf += hex;
+    buf.push_back('"');
+    buf.reserve(buf.size() + simdjson::SIMDJSON_PADDING);
+    simdjson::ondemand::parser parser;
+    auto value = parser.iterate(buf);
+    try {
+        if (false == value.is_scalar()) {
+            return false;
+        }
+
+        if (simdjson::ondemand::json_type::string != value.type()) {
+            return false;
+        }
+
+        std::string_view unescaped_utf8 = value.get_string(false);
+        destination.append(unescaped_utf8);
+    } catch (std::exception const& e) {
+        return false;
+    }
+    return true;
+}
+}  // namespace
+
+bool StringUtils::unescape_kql_value(std::string const& value, std::string& unescaped) {
+    return unescape_kql_internal(value, unescaped, true);
+}
+
+bool StringUtils::unescape_kql_internal(
+        std::string const& value,
+        std::string& unescaped,
+        bool is_value
+) {
+    bool escaped{false};
+    for (size_t i = 0; i < value.size(); ++i) {
+        if (false == escaped && '\\' != value[i]) {
+            unescaped.push_back(value[i]);
+            continue;
+        } else if (false == escaped && '\\' == value[i]) {
+            escaped = true;
+            continue;
+        }
+
+        escaped = false;
+        switch (value[i]) {
+            case '\\':
+                unescaped.append("\\\\");
+                break;
+            case '"':
+                unescaped.push_back('"');
+                break;
+            case 't':
+                unescaped.push_back('\t');
+                break;
+            case 'r':
+                unescaped.push_back('\r');
+                break;
+            case 'n':
+                unescaped.push_back('\n');
+                break;
+            case 'b':
+                unescaped.push_back('\b');
+                break;
+            case 'f':
+                unescaped.push_back('\f');
+                break;
+            case 'u': {
+                size_t last_char_in_codepoint = i + 4;
+                if (value.size() <= last_char_in_codepoint) {
+                    return false;
+                }
+
+                auto four_byte_hex = std::string_view{value}.substr(i + 1, 4);
+                i += 4;
+
+                std::string tmp;
+                if (false == convert_four_byte_hex_to_utf8(four_byte_hex, tmp)) {
+                    return false;
+                }
+
+                // Make sure unicode escape sequences are always treated as literal characters
+                if ("\\" == tmp) {
+                    unescaped.append("\\\\");
+                } else if ("?" == tmp && is_value) {
+                    unescaped.append("\\?");
+                } else if ("*" == tmp) {
+                    unescaped.append("\\*");
+                } else {
+                    unescaped.append(tmp);
+                }
+                break;
+            }
+            case '{':
+                unescaped.push_back('{');
+                break;
+            case '}':
+                unescaped.push_back('}');
+                break;
+            case '(':
+                unescaped.push_back('(');
+                break;
+            case ')':
+                unescaped.push_back(')');
+                break;
+            case '<':
+                unescaped.push_back('<');
+                break;
+            case '>':
+                unescaped.push_back('>');
+                break;
+            case '*':
+                unescaped.append("\\*");
+                break;
+            case '?':
+                if (is_value) {
+                    unescaped.append("\\?");
+                } else {
+                    unescaped.push_back('?');
+                }
+                break;
+            default:
+                return false;
+        }
+    }
+
+    if (escaped) {
+        return false;
+    }
     return true;
 }
 }  // namespace clp_s
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index 553f7e608..ab9298399 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -1,6 +1,7 @@
 #ifndef CLP_S_UTILS_HPP
 #define CLP_S_UTILS_HPP
 
+#include <array>
 #include <charconv>
 #include <cstring>
 #include <sstream>
@@ -209,7 +210,8 @@ class StringUtils {
     static bool convert_string_to_double(std::string const& raw, double& converted);
 
     /**
-     * Converts a string column descriptor delimited by '.' into a list of tokens
+     * Converts a KQL string column descriptor delimited by '.' into a list of tokens. The
+     * descriptor is tokenized and unescaped per the escaping rules for KQL columns.
      * @param descriptor
      * @param tokens
      * @return true if the descriptor was tokenized successfully, false otherwise
@@ -217,6 +219,36 @@ class StringUtils {
     [[nodiscard]] static bool
     tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);
 
+    /**
+     * Escapes a string according to JSON string escaping rules and appends the escaped string to
+     * a buffer. The input string can be either ascii or UTF-8.
+     *
+     * According to the JSON spec JSON strings must escape control sequences (characters 0x00
+     * through 0x1f) as well as the '"' and '\' characters.
+     *
+     * This function escapes common control sequences like newline with short escape sequences
+     * (e.g. \n) and less common control sequences with unicode escape sequences (e.g. \u001f). The
+     * '"' and '\' characters are escaped with a backslash.
+     *
+     * @param source
+     * @param destination
+     */
+    static void escape_json_string(std::string& destination, std::string_view const source);
+
+    /**
+     * Unescapes a KQL value string according to the escaping rules for KQL value strings and
+     * converts it into a valid CLP search string.
+     *
+     * Specifically this means that the string is unescaped, but the escape sequences '\\', '\*',
+     * and '\?' are preserved so that the resulting string can be interpreted correctly by CLP
+     * search.
+     *
+     * @param value
+     * @param unescaped
+     * @return true if the value was escaped succesfully and false otherwise.
+     */
+    static bool unescape_kql_value(std::string const& value, std::string& unescaped);
+
 private:
     /**
      * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the
@@ -236,6 +268,47 @@ class StringUtils {
             char const*& wild_current,
             char const*& wild_bookmark
     );
+
+    /**
+     * Converts a character into its two byte hexadecimal representation.
+     * @param c
+     * @return the two byte hexadecimal representation of c as an array of two characters.
+     */
+    static std::array<char, 2> char_to_hex(char c) {
+        std::array<char, 2> ret;
+        auto nibble_to_hex = [](char nibble) -> char {
+            if ('\x00' <= nibble && nibble <= '\x09') {
+                return '0' + (nibble - '\x00');
+            } else {
+                return 'a' + (nibble - '\x10');
+            }
+        };
+
+        return std::array<char, 2>{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
+    }
+
+    /**
+     * Converts a character into a unicode escape sequence (e.g. \u0000) and appends the escape
+     * sequences to the `destination` buffer.
+     * @param destination
+     * @param c
+     */
+    static void char_to_escaped_four_char_hex(std::string& destination, char c) {
+        destination.append("\\u00");
+        auto hex = char_to_hex(c);
+        destination.append(hex.data(), hex.size());
+    }
+
+    /**
+     * Unescape a KQL key or value with special handling for each case and append the unescaped
+     * value to the `unescaped` buffer.
+     * @param value
+     * @param unescaped
+     * @param is_value
+     * @return true if the value was unescaped succesfully and false otherwise.
+     */
+    static bool
+    unescape_kql_internal(std::string const& value, std::string& unescaped, bool is_value);
 };
 
 enum EvaluatedValue {
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index 2c6639290..a1b0da506 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -206,7 +206,7 @@ bool search_archive(
                 SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
                 return false;
             }
-            projection->add_column(ColumnDescriptor::create(descriptor_tokens));
+            projection->add_column(ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens));
         }
     } catch (clp_s::TraceableException& e) {
         SPDLOG_ERROR("{}", e.what());
diff --git a/components/core/src/clp_s/search/AddTimestampConditions.cpp b/components/core/src/clp_s/search/AddTimestampConditions.cpp
index 5af7ef44a..addf7069f 100644
--- a/components/core/src/clp_s/search/AddTimestampConditions.cpp
+++ b/components/core/src/clp_s/search/AddTimestampConditions.cpp
@@ -18,7 +18,8 @@ std::shared_ptr<Expression> AddTimestampConditions::run(std::shared_ptr<Expressi
         return EmptyExpr::create();
     }
 
-    auto timestamp_column = ColumnDescriptor::create(m_timestamp_column.value());
+    auto timestamp_column
+            = ColumnDescriptor::create_from_escaped_tokens(m_timestamp_column.value());
 
     auto and_expr = AndExpr::create();
     if (m_begin_ts.has_value()) {
diff --git a/components/core/src/clp_s/search/ColumnDescriptor.cpp b/components/core/src/clp_s/search/ColumnDescriptor.cpp
index 33d5ed6cf..de944022e 100644
--- a/components/core/src/clp_s/search/ColumnDescriptor.cpp
+++ b/components/core/src/clp_s/search/ColumnDescriptor.cpp
@@ -6,7 +6,7 @@ namespace clp_s::search {
 DescriptorList tokenize_descriptor(std::vector<std::string> const& descriptors) {
     DescriptorList list;
     for (std::string const& descriptor : descriptors) {
-        list.push_back(DescriptorToken(descriptor));
+        list.push_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor));
     }
     return list;
 }
@@ -15,7 +15,7 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() {
     m_unresolved_descriptors = false;
     m_pure_wildcard = m_descriptors.size() == 1 && m_descriptors[0].wildcard();
     for (auto const& token : m_descriptors) {
-        if (token.wildcard() || token.regex()) {
+        if (token.wildcard()) {
             m_unresolved_descriptors = true;
             break;
         }
@@ -24,7 +24,7 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() {
 
 ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) {
     m_flags = cAllTypes;
-    m_descriptors.emplace_back(descriptor);
+    m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor));
     check_and_set_unresolved_descriptor_flag();
     if (is_unresolved_descriptor()) {
         simplify_descriptor_wildcards();
@@ -49,17 +49,21 @@ ColumnDescriptor::ColumnDescriptor(DescriptorList const& descriptors) {
     }
 }
 
-std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create(std::string const& descriptor) {
-    return std::shared_ptr<ColumnDescriptor>(new ColumnDescriptor(descriptor));
+std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create_from_escaped_token(
+        std::string const& token
+) {
+    return std::shared_ptr<ColumnDescriptor>(new ColumnDescriptor(token));
 }
 
-std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create(
-        std::vector<std::string> const& descriptors
+std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create_from_escaped_tokens(
+        std::vector<std::string> const& tokens
 ) {
-    return std::shared_ptr<ColumnDescriptor>(new ColumnDescriptor(descriptors));
+    return std::shared_ptr<ColumnDescriptor>(new ColumnDescriptor(tokens));
 }
 
-std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create(DescriptorList const& descriptors) {
+std::shared_ptr<ColumnDescriptor> ColumnDescriptor::create_from_descriptors(
+        DescriptorList const& descriptors
+) {
     return std::shared_ptr<ColumnDescriptor>(new ColumnDescriptor(descriptors));
 }
 
diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp
index ea1cfd7ec..666910550 100644
--- a/components/core/src/clp_s/search/ColumnDescriptor.hpp
+++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp
@@ -7,6 +7,7 @@
 #include <string_view>
 #include <vector>
 
+#include "../TraceableException.hpp"
 #include "Literal.hpp"
 
 namespace clp_s::search {
@@ -15,27 +16,29 @@ namespace clp_s::search {
  */
 class DescriptorToken {
 public:
-    // Constructors
-    DescriptorToken() = default;
+    class OperationFailed : public TraceableException {
+    public:
+        // Constructors
+        OperationFailed(ErrorCode error_code, char const* const filename, int line_number)
+                : TraceableException(error_code, filename, line_number) {}
+    };
 
     /**
-     * Initialize the token from a string and set flags based on whether the token contains
-     * wildcards
-     * @param token the string to initialize the token from
+     * Creates a DescriptorToken from an escaped token string. The escape sequences '\\' and '\*'
+     * are supported in order to distinguish the literal '*' from the '*' used to match hierarchies
+     * of keys.
+     * @param token
      */
-    explicit DescriptorToken(std::string_view const token)
-            : m_token(token),
-              m_wildcard(false),
-              m_regex(false) {
-        if (token == "*") {
-            m_wildcard = true;
-        }
+    static DescriptorToken create_descriptor_from_escaped_token(std::string_view const token) {
+        return DescriptorToken{token, false};
+    }
 
-        for (char c : token) {
-            if (c == '*') {
-                m_regex = true;
-            }
-        }
+    /**
+     * Creates a DescriptorToken from a literal token string. The token is copied verbatim, and is
+     * never treated as a wildcard.
+     */
+    static DescriptorToken create_descriptor_from_literal_token(std::string_view const token) {
+        return DescriptorToken{token, true};
     }
 
     /**
@@ -44,13 +47,6 @@ class DescriptorToken {
      */
     bool wildcard() const { return m_wildcard; }
 
-    /**
-     * Whether the descriptor contains a wildcard somewhere
-     * TODO: Not currently used, and regex isn't currently supported
-     * @return true if the descriptor contains a wildcard
-     */
-    bool regex() const { return m_regex; }
-
     /**
      * Get a reference to the underlying token string
      * @return a reference to the underlying string
@@ -62,14 +58,47 @@ class DescriptorToken {
      * @return Whether this token is equal to the given token
      */
     bool operator==(DescriptorToken const& rhs) const {
-        // Note: we only need to compare the m_token field because m_regex and m_wildcard are
-        // derived from m_token.
-        return m_token == rhs.m_token;
+        return m_token == rhs.m_token && m_wildcard == rhs.m_wildcard;
     }
 
 private:
-    bool m_wildcard{};
-    bool m_regex{};
+    /**
+     * Initialize the token from a string and set flags based on whether the token contains
+     * wildcards
+     * @param token the string to initialize the token from
+     * @param bool true if the string should be interpreted as literal, and false
+     */
+    explicit DescriptorToken(std::string_view const token, bool is_literal) : m_wildcard(false) {
+        if (is_literal) {
+            m_token = token;
+            return;
+        }
+
+        if (token == "*") {
+            m_wildcard = true;
+        }
+
+        bool escaped{false};
+        for (size_t i = 0; i < token.size(); ++i) {
+            if (false == escaped) {
+                if ('\\' == token[i]) {
+                    escaped = true;
+                } else {
+                    m_token.push_back(token[i]);
+                }
+                continue;
+            } else {
+                m_token.push_back(token[i]);
+                escaped = false;
+            }
+        }
+
+        if (escaped) {
+            throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
+        }
+    }
+
+    bool m_wildcard{false};
     std::string m_token;
 };
 
@@ -92,9 +121,27 @@ class ColumnDescriptor : public Literal {
      * @param descriptor(s) the token or list of tokens making up the descriptor
      * @return A ColumnDescriptor
      */
-    static std::shared_ptr<ColumnDescriptor> create(std::string const& descriptor);
-    static std::shared_ptr<ColumnDescriptor> create(std::vector<std::string> const& descriptors);
-    static std::shared_ptr<ColumnDescriptor> create(DescriptorList const& descriptors);
+    static std::shared_ptr<ColumnDescriptor> create_from_escaped_token(std::string const& token);
+    static std::shared_ptr<ColumnDescriptor> create_from_escaped_tokens(
+            std::vector<std::string> const& tokens
+    );
+    static std::shared_ptr<ColumnDescriptor> create_from_descriptors(
+            DescriptorList const& descriptors
+    );
+
+    /**
+     * Insert an entire DescriptorList into this ColumnDescriptor at before given position.
+     * @param pos an iterator to the position inside of the internal descriptor list to insert
+     * before.
+     * @param source the list of descriptors to insert
+     */
+    void insert(DescriptorList::iterator pos, DescriptorList const& source) {
+        m_descriptors.insert(pos, source.begin(), source.end());
+        check_and_set_unresolved_descriptor_flag();
+        if (is_unresolved_descriptor()) {
+            simplify_descriptor_wildcards();
+        }
+    }
 
     /**
      * Deep copy of this ColumnDescriptor
diff --git a/components/core/src/clp_s/search/SchemaMatch.cpp b/components/core/src/clp_s/search/SchemaMatch.cpp
index 203634000..6b5f2ce8c 100644
--- a/components/core/src/clp_s/search/SchemaMatch.cpp
+++ b/components/core/src/clp_s/search/SchemaMatch.cpp
@@ -77,12 +77,15 @@ std::shared_ptr<Expression> SchemaMatch::populate_column_mapping(std::shared_ptr
                     auto literal_type = node_to_literal_type(node->get_type());
                     DescriptorList descriptors;
                     while (node->get_id() != m_tree->get_object_subtree_node_id()) {
-                        // may have to explicitly mark non-regex
-                        descriptors.emplace_back(node->get_key_name());
+                        descriptors.emplace_back(
+                                DescriptorToken::create_descriptor_from_literal_token(
+                                        node->get_key_name()
+                                )
+                        );
                         node = &m_tree->get_node(node->get_parent_id());
                     }
                     std::reverse(descriptors.begin(), descriptors.end());
-                    auto resolved_column = ColumnDescriptor::create(descriptors);
+                    auto resolved_column = ColumnDescriptor::create_from_descriptors(descriptors);
                     resolved_column->set_matching_type(literal_type);
                     *it = resolved_column;
                     cur->copy_append(possibilities.get());
diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4
index 33abf66bd..26c650ed2 100644
--- a/components/core/src/clp_s/search/kql/Kql.g4
+++ b/components/core/src/clp_s/search/kql/Kql.g4
@@ -48,36 +48,28 @@ QUOTED_STRING: '"' QUOTED_CHARACTER* '"' ;
 
 UNQUOTED_LITERAL: UNQUOTED_CHARACTER+ ;
 
-// TODO handle unicode
 fragment QUOTED_CHARACTER
     : ESCAPED_SPACE
     | '\\"'
     | ~'"'
     ;
 
-// TODO:  handle unicode
 fragment UNQUOTED_CHARACTER
     : ESCAPED_SPACE
     | ESCAPED_SPECIAL_CHARACTER
-    | ESCAPED_KEYWORD
     | WILDCARD
+    | UNICODE
     |  ~[\\():<>"{} \r\n\t]
     ;
 
 fragment WILDCARD:   '*' | '?';
 
-// TODO: unescape keywords
-fragment ESCAPED_KEYWORD
-    : '\\' KEYWORD
-    ;
-
 fragment KEYWORD
     : AND
     | OR
     | NOT
     ;
 
-
 RANGE_OPERATOR
     : '<='
     | '>='
@@ -99,9 +91,8 @@ fragment SPECIAL_CHARACTER
     : [\\():<>"*?{}.]
     ;
 
-
 // For unicode hex
-//UNICODE: 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ;
-//fragment HEXDIGIT: [0-9a-fA-F]+ ;
+fragment UNICODE: '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ;
+fragment HEXDIGIT: [0-9a-fA-F]+ ;
 
 SPACE:  [ \t\r\n] -> skip ;
diff --git a/components/core/src/clp_s/search/kql/kql.cpp b/components/core/src/clp_s/search/kql/kql.cpp
index 972e44ad7..098886671 100644
--- a/components/core/src/clp_s/search/kql/kql.cpp
+++ b/components/core/src/clp_s/search/kql/kql.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -54,7 +55,7 @@ class ParseTreeVisitor : public KqlBaseVisitor {
 private:
     static void
     prepend_column(std::shared_ptr<ColumnDescriptor> const& desc, DescriptorList const& prefix) {
-        desc->get_descriptor_list().insert(desc->descriptor_begin(), prefix.begin(), prefix.end());
+        desc->insert(desc->get_descriptor_list().begin(), prefix);
     }
 
     void prepend_column(std::shared_ptr<Expression> const& expr, DescriptorList const& prefix) {
@@ -69,7 +70,7 @@ class ParseTreeVisitor : public KqlBaseVisitor {
 
 public:
     static std::string unquote_string(std::string const& text) {
-        if (text.at(0) == '"') {
+        if (false == text.empty() && '"' == text.at(0)) {
             return text.substr(1, text.length() - 2);
         } else {
             return text;
@@ -83,7 +84,11 @@ class ParseTreeVisitor : public KqlBaseVisitor {
     }
 
     static std::shared_ptr<Literal> unquote_literal(std::string const& text) {
-        std::string token = unquote_string(text);
+        std::string token;
+        if (false == StringUtils::unescape_kql_value(unquote_string(text), token)) {
+            SPDLOG_ERROR("Can not parse invalid literal: {}", text);
+            throw std::runtime_error{"Invalid literal."};
+        }
 
         if (auto ret = Integral::create_from_string(token)) {
             return ret;
@@ -97,7 +102,11 @@ class ParseTreeVisitor : public KqlBaseVisitor {
     }
 
     static std::shared_ptr<Literal> unquote_date_literal(std::string const& text) {
-        std::string token = unquote_date_string(text);
+        std::string token;
+        if (false == StringUtils::unescape_kql_value(unquote_date_string(text), token)) {
+            SPDLOG_ERROR("Can not parse invalid date literal: {}", text);
+            throw std::runtime_error{"Invalid date literal."};
+        }
 
         return DateLiteral::create_from_string(token);
     }
@@ -117,7 +126,7 @@ class ParseTreeVisitor : public KqlBaseVisitor {
             return nullptr;
         }
 
-        return ColumnDescriptor::create(descriptor_tokens);
+        return ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens);
     }
 
     std::any visitNestedQuery(KqlParser::NestedQueryContext* ctx) override {
@@ -193,7 +202,7 @@ class ParseTreeVisitor : public KqlBaseVisitor {
 
     std::any visitValue_expression(KqlParser::Value_expressionContext* ctx) override {
         auto lit = unquote_literal(ctx->LITERAL()->getText());
-        auto descriptor = ColumnDescriptor::create("*");
+        auto descriptor = ColumnDescriptor::create_from_escaped_token("*");
         return FilterExpr::create(descriptor, FilterOperation::EQ, lit);
     }
 
@@ -213,7 +222,7 @@ class ParseTreeVisitor : public KqlBaseVisitor {
             base = OrExpr::create();
         }
 
-        auto empty_descriptor = ColumnDescriptor::create(DescriptorList());
+        auto empty_descriptor = ColumnDescriptor::create_from_descriptors(DescriptorList());
         for (auto token : ctx->literals) {
             auto literal = unquote_literal(token->getText());
             auto expr = FilterExpr::create(
diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp
index 2646ff5e0..9eebc8f05 100644
--- a/components/core/tests/test-kql.cpp
+++ b/components/core/tests/test-kql.cpp
@@ -65,7 +65,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
         REQUIRE(false == filter->is_inverted());
         REQUIRE(FilterOperation::EQ == filter->get_operation());
         REQUIRE(1 == filter->get_column()->get_descriptor_list().size());
-        REQUIRE(DescriptorToken{"key"} == *filter->get_column()->descriptor_begin());
+        auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key");
+        REQUIRE(key_token == *filter->get_column()->descriptor_begin());
         std::string extracted_value;
         REQUIRE(filter->get_operand()->as_var_string(extracted_value, filter->get_operation()));
         REQUIRE("value" == extracted_value);
@@ -82,7 +83,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
         REQUIRE(true == not_filter->is_inverted());
         REQUIRE(FilterOperation::EQ == not_filter->get_operation());
         REQUIRE(1 == not_filter->get_column()->get_descriptor_list().size());
-        REQUIRE(DescriptorToken{"key"} == *not_filter->get_column()->descriptor_begin());
+        auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key");
+        REQUIRE(key_token == *not_filter->get_column()->descriptor_begin());
         std::string extracted_value;
         REQUIRE(not_filter->get_operand()
                         ->as_var_string(extracted_value, not_filter->get_operation()));
@@ -127,7 +129,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
             std::string key = "key" + std::to_string(suffix_number);
             std::string value = "value" + std::to_string(suffix_number);
             REQUIRE(value == extracted_value);
-            REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin());
+            auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key);
+            REQUIRE(key_token == *filter->get_column()->descriptor_begin());
             ++suffix_number;
         }
     }
@@ -176,7 +179,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
             std::string key = "key" + std::to_string(suffix_number);
             std::string value = "value" + std::to_string(suffix_number);
             REQUIRE(value == extracted_value);
-            REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin());
+            auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key);
+            REQUIRE(key_token == *filter->get_column()->descriptor_begin());
             ++suffix_number;
         }
     }
@@ -206,8 +210,10 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
         REQUIRE(FilterOperation::EQ == filter->get_operation());
         REQUIRE(2 == filter->get_column()->get_descriptor_list().size());
         auto it = filter->get_column()->descriptor_begin();
-        REQUIRE(DescriptorToken{"a.b"} == *it++);
-        REQUIRE(DescriptorToken{"c"} == *it++);
+        auto const a_b_token = DescriptorToken::create_descriptor_from_escaped_token("a.b");
+        REQUIRE(a_b_token == *it++);
+        auto const c_token = DescriptorToken::create_descriptor_from_escaped_token("c");
+        REQUIRE(c_token == *it++);
     }
 
     SECTION("Illegal escape sequences in column name") {

From 407ed466d037871af445e48b12586e80019d73e2 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Fri, 13 Dec 2024 16:09:05 +0000
Subject: [PATCH 2/9] Change archive format to include only unescaped keys and
 values

---
 components/core/src/clp_s/ColumnReader.cpp   | 22 +++++++++++++++++++
 components/core/src/clp_s/ColumnReader.hpp   | 17 +++++++++++++++
 components/core/src/clp_s/JsonParser.cpp     | 21 ++++++------------
 components/core/src/clp_s/JsonSerializer.hpp | 23 ++++++++++++++++----
 4 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/components/core/src/clp_s/ColumnReader.cpp b/components/core/src/clp_s/ColumnReader.cpp
index ded7a906d..10960f50e 100644
--- a/components/core/src/clp_s/ColumnReader.cpp
+++ b/components/core/src/clp_s/ColumnReader.cpp
@@ -2,6 +2,7 @@
 
 #include "BufferViewReader.hpp"
 #include "ColumnWriter.hpp"
+#include "Utils.hpp"
 #include "VariableDecoder.hpp"
 
 namespace clp_s {
@@ -88,6 +89,20 @@ void ClpStringColumnReader::extract_string_value_into_buffer(
     VariableDecoder::decode_variables_into_message(entry, *m_var_dict, encoded_vars, buffer);
 }
 
+void ClpStringColumnReader::extract_escaped_string_value_into_buffer(
+        uint64_t cur_message,
+        std::string& buffer
+) {
+    if (false == m_is_array) {
+        // TODO: escape while decoding instead of after.
+        std::string tmp;
+        extract_string_value_into_buffer(cur_message, tmp);
+        StringUtils::escape_json_string(buffer, tmp);
+    } else {
+        extract_string_value_into_buffer(cur_message, buffer);
+    }
+}
+
 int64_t ClpStringColumnReader::get_encoded_id(uint64_t cur_message) {
     auto value = m_logtypes[cur_message];
     return ClpStringColumnWriter::get_encoded_log_dict_id(value);
@@ -125,6 +140,13 @@ void VariableStringColumnReader::extract_string_value_into_buffer(
     buffer.append(m_var_dict->get_value(m_variables[cur_message]));
 }
 
+void VariableStringColumnReader::extract_escaped_string_value_into_buffer(
+        uint64_t cur_message,
+        std::string& buffer
+) {
+    StringUtils::escape_json_string(buffer, m_var_dict->get_value(m_variables[cur_message]));
+}
+
 int64_t VariableStringColumnReader::get_variable_id(uint64_t cur_message) {
     return m_variables[cur_message];
 }
diff --git a/components/core/src/clp_s/ColumnReader.hpp b/components/core/src/clp_s/ColumnReader.hpp
index 69995e6de..d66a75890 100644
--- a/components/core/src/clp_s/ColumnReader.hpp
+++ b/components/core/src/clp_s/ColumnReader.hpp
@@ -53,6 +53,17 @@ class BaseColumnReader {
      */
     virtual void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) = 0;
 
+    /**
+     * Extracts a value from the column, escapes it, and serializes it into a provided buffer as a
+     * string.
+     * @param cur_message
+     * @param buffer
+     */
+    virtual void
+    extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) {
+        extract_string_value_into_buffer(cur_message, buffer);
+    }
+
 private:
     int32_t m_id;
 };
@@ -152,6 +163,9 @@ class ClpStringColumnReader : public BaseColumnReader {
 
     void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override;
 
+    void
+    extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override;
+
     /**
      * Gets the encoded id of the variable
      * @param cur_message
@@ -196,6 +210,9 @@ class VariableStringColumnReader : public BaseColumnReader {
 
     void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override;
 
+    void
+    extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override;
+
     /**
      * Gets the encoded id of the variable
      * @param cur_message
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index d14a221b3..f9b4f8c0c 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -63,7 +63,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i
     size_t object_start = m_current_schema.start_unordered_object(NodeType::Object);
     ondemand::field cur_field;
     ondemand::value cur_value;
-    std::string cur_key;
+    std::string_view cur_key;
     int32_t node_id;
     while (true) {
         while (false == object_stack.empty() && object_it_stack.top() == object_stack.top().end()) {
@@ -81,7 +81,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i
         }
 
         cur_field = *object_it_stack.top();
-        cur_key = std::string_view(cur_field.unescaped_key(true));
+        cur_key = cur_field.unescaped_key(true);
         cur_value = cur_field.value();
 
         switch (cur_value.type()) {
@@ -131,9 +131,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i
                 break;
             }
             case ondemand::json_type::string: {
-                std::string value = std::string(
-                        cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2)
-                );
+                std::string_view value = cur_value.get_string(true);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer
                                       ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key);
@@ -209,9 +207,7 @@ void JsonParser::parse_array(ondemand::array array, int32_t parent_node_id) {
                 break;
             }
             case ondemand::json_type::string: {
-                std::string value = std::string(
-                        cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2)
-                );
+                std::string_view value = cur_value.get_string(true);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, "");
                 } else {
@@ -246,7 +242,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s
 
     ondemand::field cur_field;
 
-    std::string cur_key = key;
+    std::string_view cur_key = key;
     node_id_stack.push(parent_node_id);
 
     bool can_match_timestamp = !m_timestamp_column.empty();
@@ -257,7 +253,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s
     do {
         if (false == object_stack.empty()) {
             cur_field = *object_it_stack.top();
-            cur_key = std::string(std::string_view(cur_field.unescaped_key(true)));
+            cur_key = cur_field.unescaped_key(true);
             line = cur_field.value();
             if (may_match_timestamp) {
                 if (object_stack.size() <= m_timestamp_column.size()
@@ -353,10 +349,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s
                 break;
             }
             case ondemand::json_type::string: {
-                auto raw_json_token = line.raw_json_token();
-                std::string value
-                        = std::string(raw_json_token.substr(1, raw_json_token.rfind('"') - 1));
-
+                std::string_view value = line.get_string(true);
                 if (matches_timestamp) {
                     node_id = m_archive_writer->add_node(
                             node_id_stack.top(),
diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp
index 63f845525..d0eea817d 100644
--- a/components/core/src/clp_s/JsonSerializer.hpp
+++ b/components/core/src/clp_s/JsonSerializer.hpp
@@ -6,6 +6,9 @@
 #include <vector>
 
 #include "ColumnReader.hpp"
+#include "Utils.hpp"
+
+namespace clp_s {
 
 class JsonSerializer {
 public:
@@ -67,7 +70,11 @@ class JsonSerializer {
         return false;
     }
 
-    void add_special_key(std::string_view const key) { m_special_keys.emplace_back(key); }
+    void add_special_key(std::string_view const key) {
+        std::string tmp;
+        StringUtils::escape_json_string(tmp, key);
+        m_special_keys.emplace_back(tmp);
+    }
 
     void begin_object() {
         append_key();
@@ -109,11 +116,11 @@ class JsonSerializer {
         m_json_string += "],";
     }
 
-    void append_key() { append_key(m_special_keys[m_special_keys_index++]); }
+    void append_key() { append_escaped_key(m_special_keys[m_special_keys_index++]); }
 
     void append_key(std::string_view const key) {
         m_json_string += "\"";
-        m_json_string += key;
+        StringUtils::escape_json_string(m_json_string, key);
         m_json_string += "\":";
     }
 
@@ -130,11 +137,17 @@ class JsonSerializer {
     void
     append_value_from_column_with_quotes(clp_s::BaseColumnReader* column, uint64_t cur_message) {
         m_json_string += "\"";
-        column->extract_string_value_into_buffer(cur_message, m_json_string);
+        column->extract_escaped_string_value_into_buffer(cur_message, m_json_string);
         m_json_string += "\",";
     }
 
 private:
+    void append_escaped_key(std::string_view const key) {
+        m_json_string.push_back('"');
+        m_json_string.append(key);
+        m_json_string.append("\":");
+    }
+
     std::string m_json_string;
     std::vector<Op> m_op_list;
     std::vector<std::string> m_special_keys;
@@ -143,4 +156,6 @@ class JsonSerializer {
     size_t m_special_keys_index{0};
 };
 
+}  // namespace clp_s
+
 #endif  // CLP_S_JSONSERIALIZER_HPP

From 0b71b46cabf53c027758be6aad96c7a10bff9f80 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Mon, 16 Dec 2024 19:10:40 +0000
Subject: [PATCH 3/9] Add tests for kql value escaping

---
 components/core/tests/test-kql.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp
index 9eebc8f05..63a8ca7c2 100644
--- a/components/core/tests/test-kql.cpp
+++ b/components/core/tests/test-kql.cpp
@@ -3,6 +3,7 @@
 #include <vector>
 
 #include <Catch2/single_include/catch2/catch.hpp>
+#include <fmt/core.h>
 #include <spdlog/spdlog.h>
 
 #include "../src/clp_s/search/AndExpr.hpp"
@@ -238,4 +239,30 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
         auto filter = parse_kql_expression(empty_token_column);
         REQUIRE(nullptr == filter);
     }
+
+    SECTION("Escape sequences in value") {
+        auto translated_pair = GENERATE(
+                std::pair{"\\\\", "\\\\"},
+                std::pair{"\\??", "\\??"},
+                std::pair{"\\**", "\\**"},
+                std::pair{"\u9999", "香"},
+                std::pair{"\\r\\n\\t\\b\\f", "\r\n\t\b\f"},
+                std::pair{"\\\"", "\""},
+                std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"}
+        );
+
+        auto formatted_query = fmt::format("*: \"{}\"", translated_pair.first);
+        stringstream query{formatted_query};
+        auto filter = std::dynamic_pointer_cast<FilterExpr>(parse_kql_expression(query));
+        REQUIRE(nullptr != filter);
+        REQUIRE(nullptr != filter->get_operand());
+        REQUIRE(nullptr != filter->get_column());
+        REQUIRE(false == filter->has_only_expression_operands());
+        REQUIRE(false == filter->is_inverted());
+        REQUIRE(FilterOperation::EQ == filter->get_operation());
+        REQUIRE(true == filter->get_column()->is_pure_wildcard());
+        std::string literal;
+        REQUIRE(true == filter->get_operand()->as_var_string(literal, FilterOperation::EQ));
+        REQUIRE(literal == translated_pair.second);
+    }
 }

From 850808421d09632dc727f812228fbce54375c445 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Mon, 16 Dec 2024 19:23:34 +0000
Subject: [PATCH 4/9] Add tests for more edge cases

---
 components/core/tests/test-kql.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp
index 63a8ca7c2..a39b12d07 100644
--- a/components/core/tests/test-kql.cpp
+++ b/components/core/tests/test-kql.cpp
@@ -245,10 +245,13 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
                 std::pair{"\\\\", "\\\\"},
                 std::pair{"\\??", "\\??"},
                 std::pair{"\\**", "\\**"},
-                std::pair{"\u9999", "香"},
+                std::pair{"\\u9999", "香"},
                 std::pair{"\\r\\n\\t\\b\\f", "\r\n\t\b\f"},
                 std::pair{"\\\"", "\""},
-                std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"}
+                std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"},
+                std::pair{"\\u003F", "\\?"},
+                std::pair{"\\u002A", "\\*"},
+                std::pair{"\\u005C", "\\\\"}
         );
 
         auto formatted_query = fmt::format("*: \"{}\"", translated_pair.first);

From 1483c356eddebbacbe3eb6fe192659a5d8a0f855 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Mon, 16 Dec 2024 19:31:46 +0000
Subject: [PATCH 5/9] Address rabbit comments

---
 components/core/src/clp_s/Utils.hpp         | 6 +++---
 components/core/src/clp_s/search/kql/Kql.g4 | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index ab9298399..6f22878d4 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -230,8 +230,8 @@ class StringUtils {
      * (e.g. \n) and less common control sequences with unicode escape sequences (e.g. \u001f). The
      * '"' and '\' characters are escaped with a backslash.
      *
-     * @param source
      * @param destination
+     * @param source
      */
     static void escape_json_string(std::string& destination, std::string_view const source);
 
@@ -245,7 +245,7 @@ class StringUtils {
      *
      * @param value
      * @param unescaped
-     * @return true if the value was escaped succesfully and false otherwise.
+     * @return true if the value was unescaped successfully, false otherwise.
      */
     static bool unescape_kql_value(std::string const& value, std::string& unescaped);
 
@@ -280,7 +280,7 @@ class StringUtils {
             if ('\x00' <= nibble && nibble <= '\x09') {
                 return '0' + (nibble - '\x00');
             } else {
-                return 'a' + (nibble - '\x10');
+                return 'a' + (nibble - '\x0a');
             }
         };
 
diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4
index 26c650ed2..2737e88fa 100644
--- a/components/core/src/clp_s/search/kql/Kql.g4
+++ b/components/core/src/clp_s/search/kql/Kql.g4
@@ -93,6 +93,6 @@ fragment SPECIAL_CHARACTER
 
 // For unicode hex
 fragment UNICODE: '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ;
-fragment HEXDIGIT: [0-9a-fA-F]+ ;
+fragment HEXDIGIT: [0-9a-fA-F] ;
 
 SPACE:  [ \t\r\n] -> skip ;

From 3abe7b38e316c3b25e548f832f68ef7bab0a0730 Mon Sep 17 00:00:00 2001
From: Devin Gibson <gibber9809@users.noreply.github.com>
Date: Thu, 19 Dec 2024 15:19:07 -0500
Subject: [PATCH 6/9] Apply suggestions from code review

Co-authored-by: wraymo <37269683+wraymo@users.noreply.github.com>
---
 components/core/src/clp_s/Utils.cpp                   | 2 +-
 components/core/src/clp_s/Utils.hpp                   | 2 +-
 components/core/src/clp_s/search/ColumnDescriptor.hpp | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 976c3ee14..0b7925c44 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -540,7 +540,7 @@ void StringUtils::escape_json_string(std::string& destination, std::string_view
 
 namespace {
 /**
- * Convert a four byte hex sequence to utf8. We perform the conversion in this cumbersome way
+ * Converts a four byte hex sequence to utf8. We perform the conversion in this cumbersome way
  * because c++20 deprecates most of the much more convenient std::codecvt utilities.
  */
 bool convert_four_byte_hex_to_utf8(std::string_view const hex, std::string& destination) {
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index 6f22878d4..7fdc4a886 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -300,7 +300,7 @@ class StringUtils {
     }
 
     /**
-     * Unescape a KQL key or value with special handling for each case and append the unescaped
+     * Unescapes a KQL key or value with special handling for each case and append the unescaped
      * value to the `unescaped` buffer.
      * @param value
      * @param unescaped
diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp
index 666910550..f982dd189 100644
--- a/components/core/src/clp_s/search/ColumnDescriptor.hpp
+++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp
@@ -63,7 +63,7 @@ class DescriptorToken {
 
 private:
     /**
-     * Initialize the token from a string and set flags based on whether the token contains
+     * Initializes the token from a string and sets flags based on whether the token contains
      * wildcards
      * @param token the string to initialize the token from
      * @param bool true if the string should be interpreted as literal, and false
@@ -130,10 +130,10 @@ class ColumnDescriptor : public Literal {
     );
 
     /**
-     * Insert an entire DescriptorList into this ColumnDescriptor at before given position.
-     * @param pos an iterator to the position inside of the internal descriptor list to insert
+     * Inserts an entire DescriptorList into this ColumnDescriptor before the specified position.
+     * @param an iterator indicating the position in the internal descriptor list before which the new descriptors will be inserted.
      * before.
-     * @param source the list of descriptors to insert
+     * @param source the list of descriptors to be inserted
      */
     void insert(DescriptorList::iterator pos, DescriptorList const& source) {
         m_descriptors.insert(pos, source.begin(), source.end());

From 10307f516d1a752e0f6fc175dd03a538ccd530c6 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Thu, 19 Dec 2024 20:37:10 +0000
Subject: [PATCH 7/9] Address code review comments

---
 components/core/src/clp_s/Utils.cpp           | 11 +++++----
 .../src/clp_s/search/ColumnDescriptor.cpp     |  8 +++----
 .../src/clp_s/search/ColumnDescriptor.hpp     | 23 +++++++++++++++----
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 0b7925c44..b33b2508a 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -579,11 +579,12 @@ bool StringUtils::unescape_kql_internal(
 ) {
     bool escaped{false};
     for (size_t i = 0; i < value.size(); ++i) {
-        if (false == escaped && '\\' != value[i]) {
-            unescaped.push_back(value[i]);
-            continue;
-        } else if (false == escaped && '\\' == value[i]) {
-            escaped = true;
+        if (false == escaped) {
+            if ('\\' == value[i]) {
+                escaped = true;
+            } else {
+                unescaped.push_back(value[i]);
+            }
             continue;
         }
 
diff --git a/components/core/src/clp_s/search/ColumnDescriptor.cpp b/components/core/src/clp_s/search/ColumnDescriptor.cpp
index de944022e..d20f9e8f9 100644
--- a/components/core/src/clp_s/search/ColumnDescriptor.cpp
+++ b/components/core/src/clp_s/search/ColumnDescriptor.cpp
@@ -22,18 +22,18 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() {
     }
 }
 
-ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) {
+ColumnDescriptor::ColumnDescriptor(std::string const& token) {
     m_flags = cAllTypes;
-    m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor));
+    m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(token));
     check_and_set_unresolved_descriptor_flag();
     if (is_unresolved_descriptor()) {
         simplify_descriptor_wildcards();
     }
 }
 
-ColumnDescriptor::ColumnDescriptor(std::vector<std::string> const& descriptors) {
+ColumnDescriptor::ColumnDescriptor(std::vector<std::string> const& tokens) {
     m_flags = cAllTypes;
-    m_descriptors = std::move(tokenize_descriptor(descriptors));
+    m_descriptors = std::move(tokenize_descriptor(tokens));
     check_and_set_unresolved_descriptor_flag();
     if (is_unresolved_descriptor()) {
         simplify_descriptor_wildcards();
diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp
index f982dd189..d675d7ed0 100644
--- a/components/core/src/clp_s/search/ColumnDescriptor.hpp
+++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp
@@ -117,22 +117,35 @@ DescriptorList tokenize_descriptor(std::vector<std::string> const& descriptors);
 class ColumnDescriptor : public Literal {
 public:
     /**
-     * Create a ColumnDescriptor literal from an integral value
-     * @param descriptor(s) the token or list of tokens making up the descriptor
-     * @return A ColumnDescriptor
+     * Creates a ColumnDescriptor literal from a list of escaped tokens.
+     *
+     * In particular there are escaping rules for literal '\' and '*':
+     * \\ -> literal \
+     * \* -> literal *
+     *
+     * A '*' that is not escaped is treated as a wildcard descriptor.
+     *
+     * @param token(s) the escaped token or list of escaped tokens making up the descriptor
+     * @return A ColumnDescriptor literal
      */
     static std::shared_ptr<ColumnDescriptor> create_from_escaped_token(std::string const& token);
     static std::shared_ptr<ColumnDescriptor> create_from_escaped_tokens(
             std::vector<std::string> const& tokens
     );
+
+    /**
+     * Create a ColumnDescriptor literal from a list of already-parsed DescriptorToken.
+     * @param descriptors the list of parsed DescriptorToken
+     * @return A ColumnDescriptor literal
+     */
     static std::shared_ptr<ColumnDescriptor> create_from_descriptors(
             DescriptorList const& descriptors
     );
 
     /**
      * Inserts an entire DescriptorList into this ColumnDescriptor before the specified position.
-     * @param an iterator indicating the position in the internal descriptor list before which the new descriptors will be inserted.
-     * before.
+     * @param an iterator indicating the position in the internal descriptor list before which the
+     * new descriptors will be inserted.
      * @param source the list of descriptors to be inserted
      */
     void insert(DescriptorList::iterator pos, DescriptorList const& source) {

From 2042a1e5c2b6e6a06eb6b73828f6787a4b2476ee Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Thu, 19 Dec 2024 20:51:44 +0000
Subject: [PATCH 8/9] Remove escaping in ir ingestion path that is unnecessary
 because of this PR

---
 components/core/src/clp_s/CMakeLists.txt |  2 --
 components/core/src/clp_s/JsonParser.cpp | 45 +++++-------------------
 2 files changed, 8 insertions(+), 39 deletions(-)

diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt
index 9ca0c947e..1001085c8 100644
--- a/components/core/src/clp_s/CMakeLists.txt
+++ b/components/core/src/clp_s/CMakeLists.txt
@@ -23,8 +23,6 @@ set(
         ../clp/ffi/KeyValuePairLogEvent.hpp
         ../clp/ffi/SchemaTree.cpp
         ../clp/ffi/SchemaTree.hpp
-        ../clp/ffi/utils.cpp
-        ../clp/ffi/utils.hpp
         ../clp/ffi/Value.hpp
         ../clp/FileDescriptor.cpp
         ../clp/FileDescriptor.hpp
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index 46078f411..aaf8188ce 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -16,7 +16,6 @@
 #include "../clp/ffi/ir_stream/IrUnitType.hpp"
 #include "../clp/ffi/KeyValuePairLogEvent.hpp"
 #include "../clp/ffi/SchemaTree.hpp"
-#include "../clp/ffi/utils.hpp"
 #include "../clp/ffi/Value.hpp"
 #include "../clp/ir/EncodedTextAst.hpp"
 #include "../clp/streaming_compression/zstd/Decompressor.hpp"
@@ -654,18 +653,11 @@ auto JsonParser::add_node_to_archive_and_translations(
         NodeType archive_node_type,
         int32_t parent_node_id
 ) -> int {
-    auto validated_escaped_key
-            = clp::ffi::validate_and_escape_utf8_string(ir_node_to_add.get_key_name());
-    std::string node_key;
-    if (validated_escaped_key.has_value()) {
-        node_key = validated_escaped_key.value();
-    } else {
-        SPDLOG_ERROR("Key is not UTF-8 compliant: \"{}\"", ir_node_to_add.get_key_name());
-        throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
-    }
-    int const curr_node_archive_id
-            = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key);
-
+    int const curr_node_archive_id = m_archive_writer->add_node(
+            parent_node_id,
+            archive_node_type,
+            ir_node_to_add.get_key_name()
+    );
     m_ir_node_to_archive_node_id_mapping.emplace(
             std::make_pair(ir_node_id, archive_node_type),
             curr_node_archive_id
@@ -759,23 +751,10 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) {
                 m_current_parsed_message.add_value(node_id, b_value);
             } break;
             case NodeType::VarString: {
-                auto const validated_escaped_string = clp::ffi::validate_and_escape_utf8_string(
-                        pair.second.value().get_immutable_view<std::string>()
-                );
-                std::string str;
-                if (validated_escaped_string.has_value()) {
-                    str = validated_escaped_string.value();
-                } else {
-                    SPDLOG_ERROR(
-                            "String is not utf8 compliant: \"{}\"",
-                            pair.second.value().get_immutable_view<std::string>()
-                    );
-                    throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
-                }
-                m_current_parsed_message.add_value(node_id, str);
+                auto const var_value{pair.second.value().get_immutable_view<std::string>()};
+                m_current_parsed_message.add_value(node_id, var_value);
             } break;
             case NodeType::ClpString: {
-                std::string encoded_str;
                 std::string decoded_value;
                 if (pair.second.value().is<clp::ir::EightByteEncodedTextAst>()) {
                     decoded_value = pair.second.value()
@@ -789,15 +768,7 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) {
                                             .decode_and_unparse()
                                             .value();
                 }
-                auto const validated_escaped_encoded_string
-                        = clp::ffi::validate_and_escape_utf8_string(decoded_value.c_str());
-                if (validated_escaped_encoded_string.has_value()) {
-                    encoded_str = validated_escaped_encoded_string.value();
-                } else {
-                    SPDLOG_ERROR("Encoded string is not utf8 compliant: \"{}\"", decoded_value);
-                    throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__);
-                }
-                m_current_parsed_message.add_value(node_id, encoded_str);
+                m_current_parsed_message.add_value(node_id, decoded_value);
             } break;
             case NodeType::UnstructuredArray: {
                 std::string array_str;

From e236e619dd378adbf1af9985a0ce91aa3c928db0 Mon Sep 17 00:00:00 2001
From: gibber9809 <devinbook1@gmail.com>
Date: Sat, 21 Dec 2024 21:54:12 +0000
Subject: [PATCH 9/9] Fix array search bug

---
 components/core/src/clp_s/search/Output.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/components/core/src/clp_s/search/Output.cpp b/components/core/src/clp_s/search/Output.cpp
index c9954779b..7b632b389 100644
--- a/components/core/src/clp_s/search/Output.cpp
+++ b/components/core/src/clp_s/search/Output.cpp
@@ -666,9 +666,7 @@ bool Output::evaluate_array_filter_object(
     }
 
     for (auto field : object) {
-        // Note: field.key() yields the escaped JSON key, so the descriptor tokens passed to search
-        // must likewise be escaped.
-        if (field.key() != unresolved_tokens[cur_idx].get_token()) {
+        if (field.unescaped_key(true).value() != unresolved_tokens[cur_idx].get_token()) {
             continue;
         }