From 1717f544b8aaadd3d72747fcd4172e789b0a965b Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 13 Dec 2024 16:08:26 +0000 Subject: [PATCH 1/9] Add utilities for escaping search queries and fix kql escaping behaviour --- components/core/src/clp_s/Utils.cpp | 245 +++++++++++++++++- components/core/src/clp_s/Utils.hpp | 75 +++++- components/core/src/clp_s/clp-s.cpp | 2 +- .../clp_s/search/AddTimestampConditions.cpp | 3 +- .../src/clp_s/search/ColumnDescriptor.cpp | 22 +- .../src/clp_s/search/ColumnDescriptor.hpp | 111 +++++--- .../core/src/clp_s/search/SchemaMatch.cpp | 9 +- components/core/src/clp_s/search/kql/Kql.g4 | 15 +- components/core/src/clp_s/search/kql/kql.cpp | 23 +- components/core/tests/test-kql.cpp | 18 +- 10 files changed, 437 insertions(+), 86 deletions(-) diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index acee48851..976c3ee14 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -1,6 +1,7 @@ #include "Utils.hpp" #include +#include #include using std::string; @@ -431,30 +432,246 @@ bool StringUtils::tokenize_column_descriptor( std::string const& descriptor, std::vector& tokens ) { - // TODO: add support for unicode sequences e.g. \u263A std::string cur_tok; - for (size_t cur = 0; cur < descriptor.size(); ++cur) { - if ('\\' == descriptor[cur]) { - ++cur; - if (cur >= descriptor.size()) { - return false; - } - } else if ('.' == descriptor[cur]) { - if (cur_tok.empty()) { - return false; + bool escaped{false}; + for (size_t i = 0; i < descriptor.size(); ++i) { + if (false == escaped) { + if ('\\' == descriptor[i]) { + escaped = true; + } else if ('.' == descriptor[i]) { + if (cur_tok.empty()) { + return false; + } + std::string unescaped_token; + if (unescape_kql_internal(cur_tok, unescaped_token, false)) { + tokens.push_back(unescaped_token); + cur_tok.clear(); + } else { + return false; + } + } else { + cur_tok.push_back(descriptor[i]); } - tokens.push_back(cur_tok); - cur_tok.clear(); continue; } - cur_tok.push_back(descriptor[cur]); + + escaped = false; + switch (descriptor[i]) { + case '.': + cur_tok.push_back('.'); + break; + default: + cur_tok.push_back('\\'); + cur_tok.push_back(descriptor[i]); + break; + } + } + + if (escaped) { + return false; } if (cur_tok.empty()) { return false; } - tokens.push_back(cur_tok); + std::string unescaped_token; + if (unescape_kql_internal(cur_tok, unescaped_token, false)) { + tokens.push_back(unescaped_token); + } else { + return false; + } + return true; +} + +void StringUtils::escape_json_string(std::string& destination, std::string_view const source) { + // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path + // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers + // a net decompression speedup of ~30% compared to adding every character to the destination one + // character at a time. + size_t slice_begin{0ULL}; + auto append_unescaped_slice = [&](size_t i) { + if (slice_begin < i) { + destination.append(source.substr(slice_begin, i - slice_begin)); + } + slice_begin = i + 1; + }; + for (size_t i = 0; i < source.size(); ++i) { + char c = source[i]; + switch (c) { + case '"': + append_unescaped_slice(i); + destination.append("\\\""); + break; + case '\\': + append_unescaped_slice(i); + destination.append("\\\\"); + break; + case '\t': + append_unescaped_slice(i); + destination.append("\\t"); + break; + case '\r': + append_unescaped_slice(i); + destination.append("\\r"); + break; + case '\n': + append_unescaped_slice(i); + destination.append("\\n"); + break; + case '\b': + append_unescaped_slice(i); + destination.append("\\b"); + break; + case '\f': + append_unescaped_slice(i); + destination.append("\\f"); + break; + default: + if ('\x00' <= c && c <= '\x1f') { + append_unescaped_slice(i); + char_to_escaped_four_char_hex(destination, c); + } + break; + } + } + append_unescaped_slice(source.size()); +} + +namespace { +/** + * Convert a four byte hex sequence to utf8. We perform the conversion in this cumbersome way + * because c++20 deprecates most of the much more convenient std::codecvt utilities. + */ +bool convert_four_byte_hex_to_utf8(std::string_view const hex, std::string& destination) { + std::string buf = "\"\\u"; + buf += hex; + buf.push_back('"'); + buf.reserve(buf.size() + simdjson::SIMDJSON_PADDING); + simdjson::ondemand::parser parser; + auto value = parser.iterate(buf); + try { + if (false == value.is_scalar()) { + return false; + } + + if (simdjson::ondemand::json_type::string != value.type()) { + return false; + } + + std::string_view unescaped_utf8 = value.get_string(false); + destination.append(unescaped_utf8); + } catch (std::exception const& e) { + return false; + } + return true; +} +} // namespace + +bool StringUtils::unescape_kql_value(std::string const& value, std::string& unescaped) { + return unescape_kql_internal(value, unescaped, true); +} + +bool StringUtils::unescape_kql_internal( + std::string const& value, + std::string& unescaped, + bool is_value +) { + bool escaped{false}; + for (size_t i = 0; i < value.size(); ++i) { + if (false == escaped && '\\' != value[i]) { + unescaped.push_back(value[i]); + continue; + } else if (false == escaped && '\\' == value[i]) { + escaped = true; + continue; + } + + escaped = false; + switch (value[i]) { + case '\\': + unescaped.append("\\\\"); + break; + case '"': + unescaped.push_back('"'); + break; + case 't': + unescaped.push_back('\t'); + break; + case 'r': + unescaped.push_back('\r'); + break; + case 'n': + unescaped.push_back('\n'); + break; + case 'b': + unescaped.push_back('\b'); + break; + case 'f': + unescaped.push_back('\f'); + break; + case 'u': { + size_t last_char_in_codepoint = i + 4; + if (value.size() <= last_char_in_codepoint) { + return false; + } + + auto four_byte_hex = std::string_view{value}.substr(i + 1, 4); + i += 4; + + std::string tmp; + if (false == convert_four_byte_hex_to_utf8(four_byte_hex, tmp)) { + return false; + } + + // Make sure unicode escape sequences are always treated as literal characters + if ("\\" == tmp) { + unescaped.append("\\\\"); + } else if ("?" == tmp && is_value) { + unescaped.append("\\?"); + } else if ("*" == tmp) { + unescaped.append("\\*"); + } else { + unescaped.append(tmp); + } + break; + } + case '{': + unescaped.push_back('{'); + break; + case '}': + unescaped.push_back('}'); + break; + case '(': + unescaped.push_back('('); + break; + case ')': + unescaped.push_back(')'); + break; + case '<': + unescaped.push_back('<'); + break; + case '>': + unescaped.push_back('>'); + break; + case '*': + unescaped.append("\\*"); + break; + case '?': + if (is_value) { + unescaped.append("\\?"); + } else { + unescaped.push_back('?'); + } + break; + default: + return false; + } + } + + if (escaped) { + return false; + } return true; } } // namespace clp_s diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 553f7e608..ab9298399 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -1,6 +1,7 @@ #ifndef CLP_S_UTILS_HPP #define CLP_S_UTILS_HPP +#include #include #include #include @@ -209,7 +210,8 @@ class StringUtils { static bool convert_string_to_double(std::string const& raw, double& converted); /** - * Converts a string column descriptor delimited by '.' into a list of tokens + * Converts a KQL string column descriptor delimited by '.' into a list of tokens. The + * descriptor is tokenized and unescaped per the escaping rules for KQL columns. * @param descriptor * @param tokens * @return true if the descriptor was tokenized successfully, false otherwise @@ -217,6 +219,36 @@ class StringUtils { [[nodiscard]] static bool tokenize_column_descriptor(std::string const& descriptor, std::vector& tokens); + /** + * Escapes a string according to JSON string escaping rules and appends the escaped string to + * a buffer. The input string can be either ascii or UTF-8. + * + * According to the JSON spec JSON strings must escape control sequences (characters 0x00 + * through 0x1f) as well as the '"' and '\' characters. + * + * This function escapes common control sequences like newline with short escape sequences + * (e.g. \n) and less common control sequences with unicode escape sequences (e.g. \u001f). The + * '"' and '\' characters are escaped with a backslash. + * + * @param source + * @param destination + */ + static void escape_json_string(std::string& destination, std::string_view const source); + + /** + * Unescapes a KQL value string according to the escaping rules for KQL value strings and + * converts it into a valid CLP search string. + * + * Specifically this means that the string is unescaped, but the escape sequences '\\', '\*', + * and '\?' are preserved so that the resulting string can be interpreted correctly by CLP + * search. + * + * @param value + * @param unescaped + * @return true if the value was escaped succesfully and false otherwise. + */ + static bool unescape_kql_value(std::string const& value, std::string& unescaped); + private: /** * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the @@ -236,6 +268,47 @@ class StringUtils { char const*& wild_current, char const*& wild_bookmark ); + + /** + * Converts a character into its two byte hexadecimal representation. + * @param c + * @return the two byte hexadecimal representation of c as an array of two characters. + */ + static std::array char_to_hex(char c) { + std::array ret; + auto nibble_to_hex = [](char nibble) -> char { + if ('\x00' <= nibble && nibble <= '\x09') { + return '0' + (nibble - '\x00'); + } else { + return 'a' + (nibble - '\x10'); + } + }; + + return std::array{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; + } + + /** + * Converts a character into a unicode escape sequence (e.g. \u0000) and appends the escape + * sequences to the `destination` buffer. + * @param destination + * @param c + */ + static void char_to_escaped_four_char_hex(std::string& destination, char c) { + destination.append("\\u00"); + auto hex = char_to_hex(c); + destination.append(hex.data(), hex.size()); + } + + /** + * Unescape a KQL key or value with special handling for each case and append the unescaped + * value to the `unescaped` buffer. + * @param value + * @param unescaped + * @param is_value + * @return true if the value was unescaped succesfully and false otherwise. + */ + static bool + unescape_kql_internal(std::string const& value, std::string& unescaped, bool is_value); }; enum EvaluatedValue { diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 2c6639290..a1b0da506 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -206,7 +206,7 @@ bool search_archive( SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column); return false; } - projection->add_column(ColumnDescriptor::create(descriptor_tokens)); + projection->add_column(ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens)); } } catch (clp_s::TraceableException& e) { SPDLOG_ERROR("{}", e.what()); diff --git a/components/core/src/clp_s/search/AddTimestampConditions.cpp b/components/core/src/clp_s/search/AddTimestampConditions.cpp index 5af7ef44a..addf7069f 100644 --- a/components/core/src/clp_s/search/AddTimestampConditions.cpp +++ b/components/core/src/clp_s/search/AddTimestampConditions.cpp @@ -18,7 +18,8 @@ std::shared_ptr AddTimestampConditions::run(std::shared_ptr const& descriptors) { DescriptorList list; for (std::string const& descriptor : descriptors) { - list.push_back(DescriptorToken(descriptor)); + list.push_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor)); } return list; } @@ -15,7 +15,7 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() { m_unresolved_descriptors = false; m_pure_wildcard = m_descriptors.size() == 1 && m_descriptors[0].wildcard(); for (auto const& token : m_descriptors) { - if (token.wildcard() || token.regex()) { + if (token.wildcard()) { m_unresolved_descriptors = true; break; } @@ -24,7 +24,7 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() { ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) { m_flags = cAllTypes; - m_descriptors.emplace_back(descriptor); + m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor)); check_and_set_unresolved_descriptor_flag(); if (is_unresolved_descriptor()) { simplify_descriptor_wildcards(); @@ -49,17 +49,21 @@ ColumnDescriptor::ColumnDescriptor(DescriptorList const& descriptors) { } } -std::shared_ptr ColumnDescriptor::create(std::string const& descriptor) { - return std::shared_ptr(new ColumnDescriptor(descriptor)); +std::shared_ptr ColumnDescriptor::create_from_escaped_token( + std::string const& token +) { + return std::shared_ptr(new ColumnDescriptor(token)); } -std::shared_ptr ColumnDescriptor::create( - std::vector const& descriptors +std::shared_ptr ColumnDescriptor::create_from_escaped_tokens( + std::vector const& tokens ) { - return std::shared_ptr(new ColumnDescriptor(descriptors)); + return std::shared_ptr(new ColumnDescriptor(tokens)); } -std::shared_ptr ColumnDescriptor::create(DescriptorList const& descriptors) { +std::shared_ptr ColumnDescriptor::create_from_descriptors( + DescriptorList const& descriptors +) { return std::shared_ptr(new ColumnDescriptor(descriptors)); } diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp index ea1cfd7ec..666910550 100644 --- a/components/core/src/clp_s/search/ColumnDescriptor.hpp +++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp @@ -7,6 +7,7 @@ #include #include +#include "../TraceableException.hpp" #include "Literal.hpp" namespace clp_s::search { @@ -15,27 +16,29 @@ namespace clp_s::search { */ class DescriptorToken { public: - // Constructors - DescriptorToken() = default; + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; /** - * Initialize the token from a string and set flags based on whether the token contains - * wildcards - * @param token the string to initialize the token from + * Creates a DescriptorToken from an escaped token string. The escape sequences '\\' and '\*' + * are supported in order to distinguish the literal '*' from the '*' used to match hierarchies + * of keys. + * @param token */ - explicit DescriptorToken(std::string_view const token) - : m_token(token), - m_wildcard(false), - m_regex(false) { - if (token == "*") { - m_wildcard = true; - } + static DescriptorToken create_descriptor_from_escaped_token(std::string_view const token) { + return DescriptorToken{token, false}; + } - for (char c : token) { - if (c == '*') { - m_regex = true; - } - } + /** + * Creates a DescriptorToken from a literal token string. The token is copied verbatim, and is + * never treated as a wildcard. + */ + static DescriptorToken create_descriptor_from_literal_token(std::string_view const token) { + return DescriptorToken{token, true}; } /** @@ -44,13 +47,6 @@ class DescriptorToken { */ bool wildcard() const { return m_wildcard; } - /** - * Whether the descriptor contains a wildcard somewhere - * TODO: Not currently used, and regex isn't currently supported - * @return true if the descriptor contains a wildcard - */ - bool regex() const { return m_regex; } - /** * Get a reference to the underlying token string * @return a reference to the underlying string @@ -62,14 +58,47 @@ class DescriptorToken { * @return Whether this token is equal to the given token */ bool operator==(DescriptorToken const& rhs) const { - // Note: we only need to compare the m_token field because m_regex and m_wildcard are - // derived from m_token. - return m_token == rhs.m_token; + return m_token == rhs.m_token && m_wildcard == rhs.m_wildcard; } private: - bool m_wildcard{}; - bool m_regex{}; + /** + * Initialize the token from a string and set flags based on whether the token contains + * wildcards + * @param token the string to initialize the token from + * @param bool true if the string should be interpreted as literal, and false + */ + explicit DescriptorToken(std::string_view const token, bool is_literal) : m_wildcard(false) { + if (is_literal) { + m_token = token; + return; + } + + if (token == "*") { + m_wildcard = true; + } + + bool escaped{false}; + for (size_t i = 0; i < token.size(); ++i) { + if (false == escaped) { + if ('\\' == token[i]) { + escaped = true; + } else { + m_token.push_back(token[i]); + } + continue; + } else { + m_token.push_back(token[i]); + escaped = false; + } + } + + if (escaped) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + } + + bool m_wildcard{false}; std::string m_token; }; @@ -92,9 +121,27 @@ class ColumnDescriptor : public Literal { * @param descriptor(s) the token or list of tokens making up the descriptor * @return A ColumnDescriptor */ - static std::shared_ptr create(std::string const& descriptor); - static std::shared_ptr create(std::vector const& descriptors); - static std::shared_ptr create(DescriptorList const& descriptors); + static std::shared_ptr create_from_escaped_token(std::string const& token); + static std::shared_ptr create_from_escaped_tokens( + std::vector const& tokens + ); + static std::shared_ptr create_from_descriptors( + DescriptorList const& descriptors + ); + + /** + * Insert an entire DescriptorList into this ColumnDescriptor at before given position. + * @param pos an iterator to the position inside of the internal descriptor list to insert + * before. + * @param source the list of descriptors to insert + */ + void insert(DescriptorList::iterator pos, DescriptorList const& source) { + m_descriptors.insert(pos, source.begin(), source.end()); + check_and_set_unresolved_descriptor_flag(); + if (is_unresolved_descriptor()) { + simplify_descriptor_wildcards(); + } + } /** * Deep copy of this ColumnDescriptor diff --git a/components/core/src/clp_s/search/SchemaMatch.cpp b/components/core/src/clp_s/search/SchemaMatch.cpp index 203634000..6b5f2ce8c 100644 --- a/components/core/src/clp_s/search/SchemaMatch.cpp +++ b/components/core/src/clp_s/search/SchemaMatch.cpp @@ -77,12 +77,15 @@ std::shared_ptr SchemaMatch::populate_column_mapping(std::shared_ptr auto literal_type = node_to_literal_type(node->get_type()); DescriptorList descriptors; while (node->get_id() != m_tree->get_object_subtree_node_id()) { - // may have to explicitly mark non-regex - descriptors.emplace_back(node->get_key_name()); + descriptors.emplace_back( + DescriptorToken::create_descriptor_from_literal_token( + node->get_key_name() + ) + ); node = &m_tree->get_node(node->get_parent_id()); } std::reverse(descriptors.begin(), descriptors.end()); - auto resolved_column = ColumnDescriptor::create(descriptors); + auto resolved_column = ColumnDescriptor::create_from_descriptors(descriptors); resolved_column->set_matching_type(literal_type); *it = resolved_column; cur->copy_append(possibilities.get()); diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4 index 33abf66bd..26c650ed2 100644 --- a/components/core/src/clp_s/search/kql/Kql.g4 +++ b/components/core/src/clp_s/search/kql/Kql.g4 @@ -48,36 +48,28 @@ QUOTED_STRING: '"' QUOTED_CHARACTER* '"' ; UNQUOTED_LITERAL: UNQUOTED_CHARACTER+ ; -// TODO handle unicode fragment QUOTED_CHARACTER : ESCAPED_SPACE | '\\"' | ~'"' ; -// TODO: handle unicode fragment UNQUOTED_CHARACTER : ESCAPED_SPACE | ESCAPED_SPECIAL_CHARACTER - | ESCAPED_KEYWORD | WILDCARD + | UNICODE | ~[\\():<>"{} \r\n\t] ; fragment WILDCARD: '*' | '?'; -// TODO: unescape keywords -fragment ESCAPED_KEYWORD - : '\\' KEYWORD - ; - fragment KEYWORD : AND | OR | NOT ; - RANGE_OPERATOR : '<=' | '>=' @@ -99,9 +91,8 @@ fragment SPECIAL_CHARACTER : [\\():<>"*?{}.] ; - // For unicode hex -//UNICODE: 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; -//fragment HEXDIGIT: [0-9a-fA-F]+ ; +fragment UNICODE: '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; +fragment HEXDIGIT: [0-9a-fA-F]+ ; SPACE: [ \t\r\n] -> skip ; diff --git a/components/core/src/clp_s/search/kql/kql.cpp b/components/core/src/clp_s/search/kql/kql.cpp index 972e44ad7..098886671 100644 --- a/components/core/src/clp_s/search/kql/kql.cpp +++ b/components/core/src/clp_s/search/kql/kql.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -54,7 +55,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { private: static void prepend_column(std::shared_ptr const& desc, DescriptorList const& prefix) { - desc->get_descriptor_list().insert(desc->descriptor_begin(), prefix.begin(), prefix.end()); + desc->insert(desc->get_descriptor_list().begin(), prefix); } void prepend_column(std::shared_ptr const& expr, DescriptorList const& prefix) { @@ -69,7 +70,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { public: static std::string unquote_string(std::string const& text) { - if (text.at(0) == '"') { + if (false == text.empty() && '"' == text.at(0)) { return text.substr(1, text.length() - 2); } else { return text; @@ -83,7 +84,11 @@ class ParseTreeVisitor : public KqlBaseVisitor { } static std::shared_ptr unquote_literal(std::string const& text) { - std::string token = unquote_string(text); + std::string token; + if (false == StringUtils::unescape_kql_value(unquote_string(text), token)) { + SPDLOG_ERROR("Can not parse invalid literal: {}", text); + throw std::runtime_error{"Invalid literal."}; + } if (auto ret = Integral::create_from_string(token)) { return ret; @@ -97,7 +102,11 @@ class ParseTreeVisitor : public KqlBaseVisitor { } static std::shared_ptr unquote_date_literal(std::string const& text) { - std::string token = unquote_date_string(text); + std::string token; + if (false == StringUtils::unescape_kql_value(unquote_date_string(text), token)) { + SPDLOG_ERROR("Can not parse invalid date literal: {}", text); + throw std::runtime_error{"Invalid date literal."}; + } return DateLiteral::create_from_string(token); } @@ -117,7 +126,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { return nullptr; } - return ColumnDescriptor::create(descriptor_tokens); + return ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens); } std::any visitNestedQuery(KqlParser::NestedQueryContext* ctx) override { @@ -193,7 +202,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { std::any visitValue_expression(KqlParser::Value_expressionContext* ctx) override { auto lit = unquote_literal(ctx->LITERAL()->getText()); - auto descriptor = ColumnDescriptor::create("*"); + auto descriptor = ColumnDescriptor::create_from_escaped_token("*"); return FilterExpr::create(descriptor, FilterOperation::EQ, lit); } @@ -213,7 +222,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { base = OrExpr::create(); } - auto empty_descriptor = ColumnDescriptor::create(DescriptorList()); + auto empty_descriptor = ColumnDescriptor::create_from_descriptors(DescriptorList()); for (auto token : ctx->literals) { auto literal = unquote_literal(token->getText()); auto expr = FilterExpr::create( diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp index 2646ff5e0..9eebc8f05 100644 --- a/components/core/tests/test-kql.cpp +++ b/components/core/tests/test-kql.cpp @@ -65,7 +65,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(false == filter->is_inverted()); REQUIRE(FilterOperation::EQ == filter->get_operation()); REQUIRE(1 == filter->get_column()->get_descriptor_list().size()); - REQUIRE(DescriptorToken{"key"} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key"); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); std::string extracted_value; REQUIRE(filter->get_operand()->as_var_string(extracted_value, filter->get_operation())); REQUIRE("value" == extracted_value); @@ -82,7 +83,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(true == not_filter->is_inverted()); REQUIRE(FilterOperation::EQ == not_filter->get_operation()); REQUIRE(1 == not_filter->get_column()->get_descriptor_list().size()); - REQUIRE(DescriptorToken{"key"} == *not_filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key"); + REQUIRE(key_token == *not_filter->get_column()->descriptor_begin()); std::string extracted_value; REQUIRE(not_filter->get_operand() ->as_var_string(extracted_value, not_filter->get_operation())); @@ -127,7 +129,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { std::string key = "key" + std::to_string(suffix_number); std::string value = "value" + std::to_string(suffix_number); REQUIRE(value == extracted_value); - REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); ++suffix_number; } } @@ -176,7 +179,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { std::string key = "key" + std::to_string(suffix_number); std::string value = "value" + std::to_string(suffix_number); REQUIRE(value == extracted_value); - REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); ++suffix_number; } } @@ -206,8 +210,10 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(FilterOperation::EQ == filter->get_operation()); REQUIRE(2 == filter->get_column()->get_descriptor_list().size()); auto it = filter->get_column()->descriptor_begin(); - REQUIRE(DescriptorToken{"a.b"} == *it++); - REQUIRE(DescriptorToken{"c"} == *it++); + auto const a_b_token = DescriptorToken::create_descriptor_from_escaped_token("a.b"); + REQUIRE(a_b_token == *it++); + auto const c_token = DescriptorToken::create_descriptor_from_escaped_token("c"); + REQUIRE(c_token == *it++); } SECTION("Illegal escape sequences in column name") { From 407ed466d037871af445e48b12586e80019d73e2 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 13 Dec 2024 16:09:05 +0000 Subject: [PATCH 2/9] Change archive format to include only unescaped keys and values --- components/core/src/clp_s/ColumnReader.cpp | 22 +++++++++++++++++++ components/core/src/clp_s/ColumnReader.hpp | 17 +++++++++++++++ components/core/src/clp_s/JsonParser.cpp | 21 ++++++------------ components/core/src/clp_s/JsonSerializer.hpp | 23 ++++++++++++++++---- 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/components/core/src/clp_s/ColumnReader.cpp b/components/core/src/clp_s/ColumnReader.cpp index ded7a906d..10960f50e 100644 --- a/components/core/src/clp_s/ColumnReader.cpp +++ b/components/core/src/clp_s/ColumnReader.cpp @@ -2,6 +2,7 @@ #include "BufferViewReader.hpp" #include "ColumnWriter.hpp" +#include "Utils.hpp" #include "VariableDecoder.hpp" namespace clp_s { @@ -88,6 +89,20 @@ void ClpStringColumnReader::extract_string_value_into_buffer( VariableDecoder::decode_variables_into_message(entry, *m_var_dict, encoded_vars, buffer); } +void ClpStringColumnReader::extract_escaped_string_value_into_buffer( + uint64_t cur_message, + std::string& buffer +) { + if (false == m_is_array) { + // TODO: escape while decoding instead of after. + std::string tmp; + extract_string_value_into_buffer(cur_message, tmp); + StringUtils::escape_json_string(buffer, tmp); + } else { + extract_string_value_into_buffer(cur_message, buffer); + } +} + int64_t ClpStringColumnReader::get_encoded_id(uint64_t cur_message) { auto value = m_logtypes[cur_message]; return ClpStringColumnWriter::get_encoded_log_dict_id(value); @@ -125,6 +140,13 @@ void VariableStringColumnReader::extract_string_value_into_buffer( buffer.append(m_var_dict->get_value(m_variables[cur_message])); } +void VariableStringColumnReader::extract_escaped_string_value_into_buffer( + uint64_t cur_message, + std::string& buffer +) { + StringUtils::escape_json_string(buffer, m_var_dict->get_value(m_variables[cur_message])); +} + int64_t VariableStringColumnReader::get_variable_id(uint64_t cur_message) { return m_variables[cur_message]; } diff --git a/components/core/src/clp_s/ColumnReader.hpp b/components/core/src/clp_s/ColumnReader.hpp index 69995e6de..d66a75890 100644 --- a/components/core/src/clp_s/ColumnReader.hpp +++ b/components/core/src/clp_s/ColumnReader.hpp @@ -53,6 +53,17 @@ class BaseColumnReader { */ virtual void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) = 0; + /** + * Extracts a value from the column, escapes it, and serializes it into a provided buffer as a + * string. + * @param cur_message + * @param buffer + */ + virtual void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) { + extract_string_value_into_buffer(cur_message, buffer); + } + private: int32_t m_id; }; @@ -152,6 +163,9 @@ class ClpStringColumnReader : public BaseColumnReader { void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + /** * Gets the encoded id of the variable * @param cur_message @@ -196,6 +210,9 @@ class VariableStringColumnReader : public BaseColumnReader { void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + /** * Gets the encoded id of the variable * @param cur_message diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index d14a221b3..f9b4f8c0c 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -63,7 +63,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i size_t object_start = m_current_schema.start_unordered_object(NodeType::Object); ondemand::field cur_field; ondemand::value cur_value; - std::string cur_key; + std::string_view cur_key; int32_t node_id; while (true) { while (false == object_stack.empty() && object_it_stack.top() == object_stack.top().end()) { @@ -81,7 +81,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i } cur_field = *object_it_stack.top(); - cur_key = std::string_view(cur_field.unescaped_key(true)); + cur_key = cur_field.unescaped_key(true); cur_value = cur_field.value(); switch (cur_value.type()) { @@ -131,9 +131,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i break; } case ondemand::json_type::string: { - std::string value = std::string( - cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2) - ); + std::string_view value = cur_value.get_string(true); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key); @@ -209,9 +207,7 @@ void JsonParser::parse_array(ondemand::array array, int32_t parent_node_id) { break; } case ondemand::json_type::string: { - std::string value = std::string( - cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2) - ); + std::string_view value = cur_value.get_string(true); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, ""); } else { @@ -246,7 +242,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s ondemand::field cur_field; - std::string cur_key = key; + std::string_view cur_key = key; node_id_stack.push(parent_node_id); bool can_match_timestamp = !m_timestamp_column.empty(); @@ -257,7 +253,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s do { if (false == object_stack.empty()) { cur_field = *object_it_stack.top(); - cur_key = std::string(std::string_view(cur_field.unescaped_key(true))); + cur_key = cur_field.unescaped_key(true); line = cur_field.value(); if (may_match_timestamp) { if (object_stack.size() <= m_timestamp_column.size() @@ -353,10 +349,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s break; } case ondemand::json_type::string: { - auto raw_json_token = line.raw_json_token(); - std::string value - = std::string(raw_json_token.substr(1, raw_json_token.rfind('"') - 1)); - + std::string_view value = line.get_string(true); if (matches_timestamp) { node_id = m_archive_writer->add_node( node_id_stack.top(), diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp index 63f845525..d0eea817d 100644 --- a/components/core/src/clp_s/JsonSerializer.hpp +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -6,6 +6,9 @@ #include #include "ColumnReader.hpp" +#include "Utils.hpp" + +namespace clp_s { class JsonSerializer { public: @@ -67,7 +70,11 @@ class JsonSerializer { return false; } - void add_special_key(std::string_view const key) { m_special_keys.emplace_back(key); } + void add_special_key(std::string_view const key) { + std::string tmp; + StringUtils::escape_json_string(tmp, key); + m_special_keys.emplace_back(tmp); + } void begin_object() { append_key(); @@ -109,11 +116,11 @@ class JsonSerializer { m_json_string += "],"; } - void append_key() { append_key(m_special_keys[m_special_keys_index++]); } + void append_key() { append_escaped_key(m_special_keys[m_special_keys_index++]); } void append_key(std::string_view const key) { m_json_string += "\""; - m_json_string += key; + StringUtils::escape_json_string(m_json_string, key); m_json_string += "\":"; } @@ -130,11 +137,17 @@ class JsonSerializer { void append_value_from_column_with_quotes(clp_s::BaseColumnReader* column, uint64_t cur_message) { m_json_string += "\""; - column->extract_string_value_into_buffer(cur_message, m_json_string); + column->extract_escaped_string_value_into_buffer(cur_message, m_json_string); m_json_string += "\","; } private: + void append_escaped_key(std::string_view const key) { + m_json_string.push_back('"'); + m_json_string.append(key); + m_json_string.append("\":"); + } + std::string m_json_string; std::vector m_op_list; std::vector m_special_keys; @@ -143,4 +156,6 @@ class JsonSerializer { size_t m_special_keys_index{0}; }; +} // namespace clp_s + #endif // CLP_S_JSONSERIALIZER_HPP From 0b71b46cabf53c027758be6aad96c7a10bff9f80 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 19:10:40 +0000 Subject: [PATCH 3/9] Add tests for kql value escaping --- components/core/tests/test-kql.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp index 9eebc8f05..63a8ca7c2 100644 --- a/components/core/tests/test-kql.cpp +++ b/components/core/tests/test-kql.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include "../src/clp_s/search/AndExpr.hpp" @@ -238,4 +239,30 @@ TEST_CASE("Test parsing KQL", "[KQL]") { auto filter = parse_kql_expression(empty_token_column); REQUIRE(nullptr == filter); } + + SECTION("Escape sequences in value") { + auto translated_pair = GENERATE( + std::pair{"\\\\", "\\\\"}, + std::pair{"\\??", "\\??"}, + std::pair{"\\**", "\\**"}, + std::pair{"\u9999", "香"}, + std::pair{"\\r\\n\\t\\b\\f", "\r\n\t\b\f"}, + std::pair{"\\\"", "\""}, + std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"} + ); + + auto formatted_query = fmt::format("*: \"{}\"", translated_pair.first); + stringstream query{formatted_query}; + auto filter = std::dynamic_pointer_cast(parse_kql_expression(query)); + REQUIRE(nullptr != filter); + REQUIRE(nullptr != filter->get_operand()); + REQUIRE(nullptr != filter->get_column()); + REQUIRE(false == filter->has_only_expression_operands()); + REQUIRE(false == filter->is_inverted()); + REQUIRE(FilterOperation::EQ == filter->get_operation()); + REQUIRE(true == filter->get_column()->is_pure_wildcard()); + std::string literal; + REQUIRE(true == filter->get_operand()->as_var_string(literal, FilterOperation::EQ)); + REQUIRE(literal == translated_pair.second); + } } From 850808421d09632dc727f812228fbce54375c445 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 19:23:34 +0000 Subject: [PATCH 4/9] Add tests for more edge cases --- components/core/tests/test-kql.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp index 63a8ca7c2..a39b12d07 100644 --- a/components/core/tests/test-kql.cpp +++ b/components/core/tests/test-kql.cpp @@ -245,10 +245,13 @@ TEST_CASE("Test parsing KQL", "[KQL]") { std::pair{"\\\\", "\\\\"}, std::pair{"\\??", "\\??"}, std::pair{"\\**", "\\**"}, - std::pair{"\u9999", "香"}, + std::pair{"\\u9999", "香"}, std::pair{"\\r\\n\\t\\b\\f", "\r\n\t\b\f"}, std::pair{"\\\"", "\""}, - std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"} + std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"}, + std::pair{"\\u003F", "\\?"}, + std::pair{"\\u002A", "\\*"}, + std::pair{"\\u005C", "\\\\"} ); auto formatted_query = fmt::format("*: \"{}\"", translated_pair.first); From 1483c356eddebbacbe3eb6fe192659a5d8a0f855 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 19:31:46 +0000 Subject: [PATCH 5/9] Address rabbit comments --- components/core/src/clp_s/Utils.hpp | 6 +++--- components/core/src/clp_s/search/kql/Kql.g4 | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index ab9298399..6f22878d4 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -230,8 +230,8 @@ class StringUtils { * (e.g. \n) and less common control sequences with unicode escape sequences (e.g. \u001f). The * '"' and '\' characters are escaped with a backslash. * - * @param source * @param destination + * @param source */ static void escape_json_string(std::string& destination, std::string_view const source); @@ -245,7 +245,7 @@ class StringUtils { * * @param value * @param unescaped - * @return true if the value was escaped succesfully and false otherwise. + * @return true if the value was unescaped successfully, false otherwise. */ static bool unescape_kql_value(std::string const& value, std::string& unescaped); @@ -280,7 +280,7 @@ class StringUtils { if ('\x00' <= nibble && nibble <= '\x09') { return '0' + (nibble - '\x00'); } else { - return 'a' + (nibble - '\x10'); + return 'a' + (nibble - '\x0a'); } }; diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4 index 26c650ed2..2737e88fa 100644 --- a/components/core/src/clp_s/search/kql/Kql.g4 +++ b/components/core/src/clp_s/search/kql/Kql.g4 @@ -93,6 +93,6 @@ fragment SPECIAL_CHARACTER // For unicode hex fragment UNICODE: '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; -fragment HEXDIGIT: [0-9a-fA-F]+ ; +fragment HEXDIGIT: [0-9a-fA-F] ; SPACE: [ \t\r\n] -> skip ; From 3abe7b38e316c3b25e548f832f68ef7bab0a0730 Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Thu, 19 Dec 2024 15:19:07 -0500 Subject: [PATCH 6/9] Apply suggestions from code review Co-authored-by: wraymo <37269683+wraymo@users.noreply.github.com> --- components/core/src/clp_s/Utils.cpp | 2 +- components/core/src/clp_s/Utils.hpp | 2 +- components/core/src/clp_s/search/ColumnDescriptor.hpp | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 976c3ee14..0b7925c44 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -540,7 +540,7 @@ void StringUtils::escape_json_string(std::string& destination, std::string_view namespace { /** - * Convert a four byte hex sequence to utf8. We perform the conversion in this cumbersome way + * Converts a four byte hex sequence to utf8. We perform the conversion in this cumbersome way * because c++20 deprecates most of the much more convenient std::codecvt utilities. */ bool convert_four_byte_hex_to_utf8(std::string_view const hex, std::string& destination) { diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 6f22878d4..7fdc4a886 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -300,7 +300,7 @@ class StringUtils { } /** - * Unescape a KQL key or value with special handling for each case and append the unescaped + * Unescapes a KQL key or value with special handling for each case and append the unescaped * value to the `unescaped` buffer. * @param value * @param unescaped diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp index 666910550..f982dd189 100644 --- a/components/core/src/clp_s/search/ColumnDescriptor.hpp +++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp @@ -63,7 +63,7 @@ class DescriptorToken { private: /** - * Initialize the token from a string and set flags based on whether the token contains + * Initializes the token from a string and sets flags based on whether the token contains * wildcards * @param token the string to initialize the token from * @param bool true if the string should be interpreted as literal, and false @@ -130,10 +130,10 @@ class ColumnDescriptor : public Literal { ); /** - * Insert an entire DescriptorList into this ColumnDescriptor at before given position. - * @param pos an iterator to the position inside of the internal descriptor list to insert + * Inserts an entire DescriptorList into this ColumnDescriptor before the specified position. + * @param an iterator indicating the position in the internal descriptor list before which the new descriptors will be inserted. * before. - * @param source the list of descriptors to insert + * @param source the list of descriptors to be inserted */ void insert(DescriptorList::iterator pos, DescriptorList const& source) { m_descriptors.insert(pos, source.begin(), source.end()); From 10307f516d1a752e0f6fc175dd03a538ccd530c6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Thu, 19 Dec 2024 20:37:10 +0000 Subject: [PATCH 7/9] Address code review comments --- components/core/src/clp_s/Utils.cpp | 11 +++++---- .../src/clp_s/search/ColumnDescriptor.cpp | 8 +++---- .../src/clp_s/search/ColumnDescriptor.hpp | 23 +++++++++++++++---- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 0b7925c44..b33b2508a 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -579,11 +579,12 @@ bool StringUtils::unescape_kql_internal( ) { bool escaped{false}; for (size_t i = 0; i < value.size(); ++i) { - if (false == escaped && '\\' != value[i]) { - unescaped.push_back(value[i]); - continue; - } else if (false == escaped && '\\' == value[i]) { - escaped = true; + if (false == escaped) { + if ('\\' == value[i]) { + escaped = true; + } else { + unescaped.push_back(value[i]); + } continue; } diff --git a/components/core/src/clp_s/search/ColumnDescriptor.cpp b/components/core/src/clp_s/search/ColumnDescriptor.cpp index de944022e..d20f9e8f9 100644 --- a/components/core/src/clp_s/search/ColumnDescriptor.cpp +++ b/components/core/src/clp_s/search/ColumnDescriptor.cpp @@ -22,18 +22,18 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() { } } -ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) { +ColumnDescriptor::ColumnDescriptor(std::string const& token) { m_flags = cAllTypes; - m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor)); + m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(token)); check_and_set_unresolved_descriptor_flag(); if (is_unresolved_descriptor()) { simplify_descriptor_wildcards(); } } -ColumnDescriptor::ColumnDescriptor(std::vector const& descriptors) { +ColumnDescriptor::ColumnDescriptor(std::vector const& tokens) { m_flags = cAllTypes; - m_descriptors = std::move(tokenize_descriptor(descriptors)); + m_descriptors = std::move(tokenize_descriptor(tokens)); check_and_set_unresolved_descriptor_flag(); if (is_unresolved_descriptor()) { simplify_descriptor_wildcards(); diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp index f982dd189..d675d7ed0 100644 --- a/components/core/src/clp_s/search/ColumnDescriptor.hpp +++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp @@ -117,22 +117,35 @@ DescriptorList tokenize_descriptor(std::vector const& descriptors); class ColumnDescriptor : public Literal { public: /** - * Create a ColumnDescriptor literal from an integral value - * @param descriptor(s) the token or list of tokens making up the descriptor - * @return A ColumnDescriptor + * Creates a ColumnDescriptor literal from a list of escaped tokens. + * + * In particular there are escaping rules for literal '\' and '*': + * \\ -> literal \ + * \* -> literal * + * + * A '*' that is not escaped is treated as a wildcard descriptor. + * + * @param token(s) the escaped token or list of escaped tokens making up the descriptor + * @return A ColumnDescriptor literal */ static std::shared_ptr create_from_escaped_token(std::string const& token); static std::shared_ptr create_from_escaped_tokens( std::vector const& tokens ); + + /** + * Create a ColumnDescriptor literal from a list of already-parsed DescriptorToken. + * @param descriptors the list of parsed DescriptorToken + * @return A ColumnDescriptor literal + */ static std::shared_ptr create_from_descriptors( DescriptorList const& descriptors ); /** * Inserts an entire DescriptorList into this ColumnDescriptor before the specified position. - * @param an iterator indicating the position in the internal descriptor list before which the new descriptors will be inserted. - * before. + * @param an iterator indicating the position in the internal descriptor list before which the + * new descriptors will be inserted. * @param source the list of descriptors to be inserted */ void insert(DescriptorList::iterator pos, DescriptorList const& source) { From 2042a1e5c2b6e6a06eb6b73828f6787a4b2476ee Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Thu, 19 Dec 2024 20:51:44 +0000 Subject: [PATCH 8/9] Remove escaping in ir ingestion path that is unnecessary because of this PR --- components/core/src/clp_s/CMakeLists.txt | 2 -- components/core/src/clp_s/JsonParser.cpp | 45 +++++------------------- 2 files changed, 8 insertions(+), 39 deletions(-) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 9ca0c947e..1001085c8 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -23,8 +23,6 @@ set( ../clp/ffi/KeyValuePairLogEvent.hpp ../clp/ffi/SchemaTree.cpp ../clp/ffi/SchemaTree.hpp - ../clp/ffi/utils.cpp - ../clp/ffi/utils.hpp ../clp/ffi/Value.hpp ../clp/FileDescriptor.cpp ../clp/FileDescriptor.hpp diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 46078f411..aaf8188ce 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -16,7 +16,6 @@ #include "../clp/ffi/ir_stream/IrUnitType.hpp" #include "../clp/ffi/KeyValuePairLogEvent.hpp" #include "../clp/ffi/SchemaTree.hpp" -#include "../clp/ffi/utils.hpp" #include "../clp/ffi/Value.hpp" #include "../clp/ir/EncodedTextAst.hpp" #include "../clp/streaming_compression/zstd/Decompressor.hpp" @@ -654,18 +653,11 @@ auto JsonParser::add_node_to_archive_and_translations( NodeType archive_node_type, int32_t parent_node_id ) -> int { - auto validated_escaped_key - = clp::ffi::validate_and_escape_utf8_string(ir_node_to_add.get_key_name()); - std::string node_key; - if (validated_escaped_key.has_value()) { - node_key = validated_escaped_key.value(); - } else { - SPDLOG_ERROR("Key is not UTF-8 compliant: \"{}\"", ir_node_to_add.get_key_name()); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - int const curr_node_archive_id - = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); - + int const curr_node_archive_id = m_archive_writer->add_node( + parent_node_id, + archive_node_type, + ir_node_to_add.get_key_name() + ); m_ir_node_to_archive_node_id_mapping.emplace( std::make_pair(ir_node_id, archive_node_type), curr_node_archive_id @@ -759,23 +751,10 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) { m_current_parsed_message.add_value(node_id, b_value); } break; case NodeType::VarString: { - auto const validated_escaped_string = clp::ffi::validate_and_escape_utf8_string( - pair.second.value().get_immutable_view() - ); - std::string str; - if (validated_escaped_string.has_value()) { - str = validated_escaped_string.value(); - } else { - SPDLOG_ERROR( - "String is not utf8 compliant: \"{}\"", - pair.second.value().get_immutable_view() - ); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - m_current_parsed_message.add_value(node_id, str); + auto const var_value{pair.second.value().get_immutable_view()}; + m_current_parsed_message.add_value(node_id, var_value); } break; case NodeType::ClpString: { - std::string encoded_str; std::string decoded_value; if (pair.second.value().is()) { decoded_value = pair.second.value() @@ -789,15 +768,7 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) { .decode_and_unparse() .value(); } - auto const validated_escaped_encoded_string - = clp::ffi::validate_and_escape_utf8_string(decoded_value.c_str()); - if (validated_escaped_encoded_string.has_value()) { - encoded_str = validated_escaped_encoded_string.value(); - } else { - SPDLOG_ERROR("Encoded string is not utf8 compliant: \"{}\"", decoded_value); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - m_current_parsed_message.add_value(node_id, encoded_str); + m_current_parsed_message.add_value(node_id, decoded_value); } break; case NodeType::UnstructuredArray: { std::string array_str; From e236e619dd378adbf1af9985a0ce91aa3c928db0 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Sat, 21 Dec 2024 21:54:12 +0000 Subject: [PATCH 9/9] Fix array search bug --- components/core/src/clp_s/search/Output.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/core/src/clp_s/search/Output.cpp b/components/core/src/clp_s/search/Output.cpp index c9954779b..7b632b389 100644 --- a/components/core/src/clp_s/search/Output.cpp +++ b/components/core/src/clp_s/search/Output.cpp @@ -666,9 +666,7 @@ bool Output::evaluate_array_filter_object( } for (auto field : object) { - // Note: field.key() yields the escaped JSON key, so the descriptor tokens passed to search - // must likewise be escaped. - if (field.key() != unresolved_tokens[cur_idx].get_token()) { + if (field.unescaped_key(true).value() != unresolved_tokens[cur_idx].get_token()) { continue; }