diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index f1fa3857c..a318a1740 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -34,8 +34,6 @@ set( ../clp/ffi/KeyValuePairLogEvent.hpp ../clp/ffi/SchemaTree.cpp ../clp/ffi/SchemaTree.hpp - ../clp/ffi/utils.cpp - ../clp/ffi/utils.hpp ../clp/ffi/Value.hpp ../clp/FileDescriptor.cpp ../clp/FileDescriptor.hpp diff --git a/components/core/src/clp_s/ColumnReader.cpp b/components/core/src/clp_s/ColumnReader.cpp index ded7a906d..10960f50e 100644 --- a/components/core/src/clp_s/ColumnReader.cpp +++ b/components/core/src/clp_s/ColumnReader.cpp @@ -2,6 +2,7 @@ #include "BufferViewReader.hpp" #include "ColumnWriter.hpp" +#include "Utils.hpp" #include "VariableDecoder.hpp" namespace clp_s { @@ -88,6 +89,20 @@ void ClpStringColumnReader::extract_string_value_into_buffer( VariableDecoder::decode_variables_into_message(entry, *m_var_dict, encoded_vars, buffer); } +void ClpStringColumnReader::extract_escaped_string_value_into_buffer( + uint64_t cur_message, + std::string& buffer +) { + if (false == m_is_array) { + // TODO: escape while decoding instead of after. + std::string tmp; + extract_string_value_into_buffer(cur_message, tmp); + StringUtils::escape_json_string(buffer, tmp); + } else { + extract_string_value_into_buffer(cur_message, buffer); + } +} + int64_t ClpStringColumnReader::get_encoded_id(uint64_t cur_message) { auto value = m_logtypes[cur_message]; return ClpStringColumnWriter::get_encoded_log_dict_id(value); @@ -125,6 +140,13 @@ void VariableStringColumnReader::extract_string_value_into_buffer( buffer.append(m_var_dict->get_value(m_variables[cur_message])); } +void VariableStringColumnReader::extract_escaped_string_value_into_buffer( + uint64_t cur_message, + std::string& buffer +) { + StringUtils::escape_json_string(buffer, m_var_dict->get_value(m_variables[cur_message])); +} + int64_t VariableStringColumnReader::get_variable_id(uint64_t cur_message) { return m_variables[cur_message]; } diff --git a/components/core/src/clp_s/ColumnReader.hpp b/components/core/src/clp_s/ColumnReader.hpp index 69995e6de..d66a75890 100644 --- a/components/core/src/clp_s/ColumnReader.hpp +++ b/components/core/src/clp_s/ColumnReader.hpp @@ -53,6 +53,17 @@ class BaseColumnReader { */ virtual void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) = 0; + /** + * Extracts a value from the column, escapes it, and serializes it into a provided buffer as a + * string. + * @param cur_message + * @param buffer + */ + virtual void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) { + extract_string_value_into_buffer(cur_message, buffer); + } + private: int32_t m_id; }; @@ -152,6 +163,9 @@ class ClpStringColumnReader : public BaseColumnReader { void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + /** * Gets the encoded id of the variable * @param cur_message @@ -196,6 +210,9 @@ class VariableStringColumnReader : public BaseColumnReader { void extract_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + void + extract_escaped_string_value_into_buffer(uint64_t cur_message, std::string& buffer) override; + /** * Gets the encoded id of the variable * @param cur_message diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 21e3b0cfd..bc4ebae00 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -18,7 +18,6 @@ #include "../clp/ffi/ir_stream/IrUnitType.hpp" #include "../clp/ffi/KeyValuePairLogEvent.hpp" #include "../clp/ffi/SchemaTree.hpp" -#include "../clp/ffi/utils.hpp" #include "../clp/ffi/Value.hpp" #include "../clp/ir/EncodedTextAst.hpp" #include "../clp/NetworkReader.hpp" @@ -125,7 +124,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i size_t object_start = m_current_schema.start_unordered_object(NodeType::Object); ondemand::field cur_field; ondemand::value cur_value; - std::string cur_key; + std::string_view cur_key; int32_t node_id; while (true) { while (false == object_stack.empty() && object_it_stack.top() == object_stack.top().end()) { @@ -143,7 +142,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i } cur_field = *object_it_stack.top(); - cur_key = std::string_view(cur_field.unescaped_key(true)); + cur_key = cur_field.unescaped_key(true); cur_value = cur_field.value(); switch (cur_value.type()) { @@ -193,9 +192,7 @@ void JsonParser::parse_obj_in_array(ondemand::object line, int32_t parent_node_i break; } case ondemand::json_type::string: { - std::string value = std::string( - cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2) - ); + std::string_view value = cur_value.get_string(true); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key); @@ -271,9 +268,7 @@ void JsonParser::parse_array(ondemand::array array, int32_t parent_node_id) { break; } case ondemand::json_type::string: { - std::string value = std::string( - cur_value.raw_json_token().substr(1, cur_value.raw_json_token().size() - 2) - ); + std::string_view value = cur_value.get_string(true); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, ""); } else { @@ -308,7 +303,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s ondemand::field cur_field; - std::string cur_key = key; + std::string_view cur_key = key; node_id_stack.push(parent_node_id); bool can_match_timestamp = !m_timestamp_column.empty(); @@ -319,7 +314,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s do { if (false == object_stack.empty()) { cur_field = *object_it_stack.top(); - cur_key = std::string(std::string_view(cur_field.unescaped_key(true))); + cur_key = cur_field.unescaped_key(true); line = cur_field.value(); if (may_match_timestamp) { if (object_stack.size() <= m_timestamp_column.size() @@ -415,10 +410,7 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s break; } case ondemand::json_type::string: { - auto raw_json_token = line.raw_json_token(); - std::string value - = std::string(raw_json_token.substr(1, raw_json_token.rfind('"') - 1)); - + std::string_view value = line.get_string(true); if (matches_timestamp) { node_id = m_archive_writer->add_node( node_id_stack.top(), @@ -679,18 +671,11 @@ auto JsonParser::add_node_to_archive_and_translations( NodeType archive_node_type, int32_t parent_node_id ) -> int { - auto validated_escaped_key - = clp::ffi::validate_and_escape_utf8_string(ir_node_to_add.get_key_name()); - std::string node_key; - if (validated_escaped_key.has_value()) { - node_key = validated_escaped_key.value(); - } else { - SPDLOG_ERROR("Key is not UTF-8 compliant: \"{}\"", ir_node_to_add.get_key_name()); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - int const curr_node_archive_id - = m_archive_writer->add_node(parent_node_id, archive_node_type, node_key); - + int const curr_node_archive_id = m_archive_writer->add_node( + parent_node_id, + archive_node_type, + ir_node_to_add.get_key_name() + ); m_ir_node_to_archive_node_id_mapping.emplace( std::make_pair(ir_node_id, archive_node_type), curr_node_archive_id @@ -784,23 +769,10 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) { m_current_parsed_message.add_value(node_id, b_value); } break; case NodeType::VarString: { - auto const validated_escaped_string = clp::ffi::validate_and_escape_utf8_string( - pair.second.value().get_immutable_view() - ); - std::string str; - if (validated_escaped_string.has_value()) { - str = validated_escaped_string.value(); - } else { - SPDLOG_ERROR( - "String is not utf8 compliant: \"{}\"", - pair.second.value().get_immutable_view() - ); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - m_current_parsed_message.add_value(node_id, str); + auto const var_value{pair.second.value().get_immutable_view()}; + m_current_parsed_message.add_value(node_id, var_value); } break; case NodeType::ClpString: { - std::string encoded_str; std::string decoded_value; if (pair.second.value().is()) { decoded_value = pair.second.value() @@ -814,15 +786,7 @@ void JsonParser::parse_kv_log_event(KeyValuePairLogEvent const& kv) { .decode_and_unparse() .value(); } - auto const validated_escaped_encoded_string - = clp::ffi::validate_and_escape_utf8_string(decoded_value.c_str()); - if (validated_escaped_encoded_string.has_value()) { - encoded_str = validated_escaped_encoded_string.value(); - } else { - SPDLOG_ERROR("Encoded string is not utf8 compliant: \"{}\"", decoded_value); - throw OperationFailed(ErrorCodeFailure, __FILENAME__, __LINE__); - } - m_current_parsed_message.add_value(node_id, encoded_str); + m_current_parsed_message.add_value(node_id, decoded_value); } break; case NodeType::UnstructuredArray: { std::string array_str; diff --git a/components/core/src/clp_s/JsonSerializer.hpp b/components/core/src/clp_s/JsonSerializer.hpp index 63f845525..d0eea817d 100644 --- a/components/core/src/clp_s/JsonSerializer.hpp +++ b/components/core/src/clp_s/JsonSerializer.hpp @@ -6,6 +6,9 @@ #include #include "ColumnReader.hpp" +#include "Utils.hpp" + +namespace clp_s { class JsonSerializer { public: @@ -67,7 +70,11 @@ class JsonSerializer { return false; } - void add_special_key(std::string_view const key) { m_special_keys.emplace_back(key); } + void add_special_key(std::string_view const key) { + std::string tmp; + StringUtils::escape_json_string(tmp, key); + m_special_keys.emplace_back(tmp); + } void begin_object() { append_key(); @@ -109,11 +116,11 @@ class JsonSerializer { m_json_string += "],"; } - void append_key() { append_key(m_special_keys[m_special_keys_index++]); } + void append_key() { append_escaped_key(m_special_keys[m_special_keys_index++]); } void append_key(std::string_view const key) { m_json_string += "\""; - m_json_string += key; + StringUtils::escape_json_string(m_json_string, key); m_json_string += "\":"; } @@ -130,11 +137,17 @@ class JsonSerializer { void append_value_from_column_with_quotes(clp_s::BaseColumnReader* column, uint64_t cur_message) { m_json_string += "\""; - column->extract_string_value_into_buffer(cur_message, m_json_string); + column->extract_escaped_string_value_into_buffer(cur_message, m_json_string); m_json_string += "\","; } private: + void append_escaped_key(std::string_view const key) { + m_json_string.push_back('"'); + m_json_string.append(key); + m_json_string.append("\":"); + } + std::string m_json_string; std::vector m_op_list; std::vector m_special_keys; @@ -143,4 +156,6 @@ class JsonSerializer { size_t m_special_keys_index{0}; }; +} // namespace clp_s + #endif // CLP_S_JSONSERIALIZER_HPP diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 19b564c03..55dc62733 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "archive_constants.hpp" @@ -531,30 +532,247 @@ bool StringUtils::tokenize_column_descriptor( std::string const& descriptor, std::vector& tokens ) { - // TODO: add support for unicode sequences e.g. \u263A std::string cur_tok; - for (size_t cur = 0; cur < descriptor.size(); ++cur) { - if ('\\' == descriptor[cur]) { - ++cur; - if (cur >= descriptor.size()) { - return false; - } - } else if ('.' == descriptor[cur]) { - if (cur_tok.empty()) { - return false; + bool escaped{false}; + for (size_t i = 0; i < descriptor.size(); ++i) { + if (false == escaped) { + if ('\\' == descriptor[i]) { + escaped = true; + } else if ('.' == descriptor[i]) { + if (cur_tok.empty()) { + return false; + } + std::string unescaped_token; + if (unescape_kql_internal(cur_tok, unescaped_token, false)) { + tokens.push_back(unescaped_token); + cur_tok.clear(); + } else { + return false; + } + } else { + cur_tok.push_back(descriptor[i]); } - tokens.push_back(cur_tok); - cur_tok.clear(); continue; } - cur_tok.push_back(descriptor[cur]); + + escaped = false; + switch (descriptor[i]) { + case '.': + cur_tok.push_back('.'); + break; + default: + cur_tok.push_back('\\'); + cur_tok.push_back(descriptor[i]); + break; + } + } + + if (escaped) { + return false; } if (cur_tok.empty()) { return false; } - tokens.push_back(cur_tok); + std::string unescaped_token; + if (unescape_kql_internal(cur_tok, unescaped_token, false)) { + tokens.push_back(unescaped_token); + } else { + return false; + } + return true; +} + +void StringUtils::escape_json_string(std::string& destination, std::string_view const source) { + // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path + // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers + // a net decompression speedup of ~30% compared to adding every character to the destination one + // character at a time. + size_t slice_begin{0ULL}; + auto append_unescaped_slice = [&](size_t i) { + if (slice_begin < i) { + destination.append(source.substr(slice_begin, i - slice_begin)); + } + slice_begin = i + 1; + }; + for (size_t i = 0; i < source.size(); ++i) { + char c = source[i]; + switch (c) { + case '"': + append_unescaped_slice(i); + destination.append("\\\""); + break; + case '\\': + append_unescaped_slice(i); + destination.append("\\\\"); + break; + case '\t': + append_unescaped_slice(i); + destination.append("\\t"); + break; + case '\r': + append_unescaped_slice(i); + destination.append("\\r"); + break; + case '\n': + append_unescaped_slice(i); + destination.append("\\n"); + break; + case '\b': + append_unescaped_slice(i); + destination.append("\\b"); + break; + case '\f': + append_unescaped_slice(i); + destination.append("\\f"); + break; + default: + if ('\x00' <= c && c <= '\x1f') { + append_unescaped_slice(i); + char_to_escaped_four_char_hex(destination, c); + } + break; + } + } + append_unescaped_slice(source.size()); +} + +namespace { +/** + * Converts a four byte hex sequence to utf8. We perform the conversion in this cumbersome way + * because c++20 deprecates most of the much more convenient std::codecvt utilities. + */ +bool convert_four_byte_hex_to_utf8(std::string_view const hex, std::string& destination) { + std::string buf = "\"\\u"; + buf += hex; + buf.push_back('"'); + buf.reserve(buf.size() + simdjson::SIMDJSON_PADDING); + simdjson::ondemand::parser parser; + auto value = parser.iterate(buf); + try { + if (false == value.is_scalar()) { + return false; + } + + if (simdjson::ondemand::json_type::string != value.type()) { + return false; + } + + std::string_view unescaped_utf8 = value.get_string(false); + destination.append(unescaped_utf8); + } catch (std::exception const& e) { + return false; + } + return true; +} +} // namespace + +bool StringUtils::unescape_kql_value(std::string const& value, std::string& unescaped) { + return unescape_kql_internal(value, unescaped, true); +} + +bool StringUtils::unescape_kql_internal( + std::string const& value, + std::string& unescaped, + bool is_value +) { + bool escaped{false}; + for (size_t i = 0; i < value.size(); ++i) { + if (false == escaped) { + if ('\\' == value[i]) { + escaped = true; + } else { + unescaped.push_back(value[i]); + } + continue; + } + + escaped = false; + switch (value[i]) { + case '\\': + unescaped.append("\\\\"); + break; + case '"': + unescaped.push_back('"'); + break; + case 't': + unescaped.push_back('\t'); + break; + case 'r': + unescaped.push_back('\r'); + break; + case 'n': + unescaped.push_back('\n'); + break; + case 'b': + unescaped.push_back('\b'); + break; + case 'f': + unescaped.push_back('\f'); + break; + case 'u': { + size_t last_char_in_codepoint = i + 4; + if (value.size() <= last_char_in_codepoint) { + return false; + } + + auto four_byte_hex = std::string_view{value}.substr(i + 1, 4); + i += 4; + + std::string tmp; + if (false == convert_four_byte_hex_to_utf8(four_byte_hex, tmp)) { + return false; + } + + // Make sure unicode escape sequences are always treated as literal characters + if ("\\" == tmp) { + unescaped.append("\\\\"); + } else if ("?" == tmp && is_value) { + unescaped.append("\\?"); + } else if ("*" == tmp) { + unescaped.append("\\*"); + } else { + unescaped.append(tmp); + } + break; + } + case '{': + unescaped.push_back('{'); + break; + case '}': + unescaped.push_back('}'); + break; + case '(': + unescaped.push_back('('); + break; + case ')': + unescaped.push_back(')'); + break; + case '<': + unescaped.push_back('<'); + break; + case '>': + unescaped.push_back('>'); + break; + case '*': + unescaped.append("\\*"); + break; + case '?': + if (is_value) { + unescaped.append("\\?"); + } else { + unescaped.push_back('?'); + } + break; + default: + return false; + } + } + + if (escaped) { + return false; + } return true; } } // namespace clp_s diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 0181a6749..83ac73621 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -1,6 +1,7 @@ #ifndef CLP_S_UTILS_HPP #define CLP_S_UTILS_HPP +#include #include #include #include @@ -243,7 +244,8 @@ class StringUtils { static bool convert_string_to_double(std::string const& raw, double& converted); /** - * Converts a string column descriptor delimited by '.' into a list of tokens + * Converts a KQL string column descriptor delimited by '.' into a list of tokens. The + * descriptor is tokenized and unescaped per the escaping rules for KQL columns. * @param descriptor * @param tokens * @return true if the descriptor was tokenized successfully, false otherwise @@ -251,6 +253,36 @@ class StringUtils { [[nodiscard]] static bool tokenize_column_descriptor(std::string const& descriptor, std::vector& tokens); + /** + * Escapes a string according to JSON string escaping rules and appends the escaped string to + * a buffer. The input string can be either ascii or UTF-8. + * + * According to the JSON spec JSON strings must escape control sequences (characters 0x00 + * through 0x1f) as well as the '"' and '\' characters. + * + * This function escapes common control sequences like newline with short escape sequences + * (e.g. \n) and less common control sequences with unicode escape sequences (e.g. \u001f). The + * '"' and '\' characters are escaped with a backslash. + * + * @param destination + * @param source + */ + static void escape_json_string(std::string& destination, std::string_view const source); + + /** + * Unescapes a KQL value string according to the escaping rules for KQL value strings and + * converts it into a valid CLP search string. + * + * Specifically this means that the string is unescaped, but the escape sequences '\\', '\*', + * and '\?' are preserved so that the resulting string can be interpreted correctly by CLP + * search. + * + * @param value + * @param unescaped + * @return true if the value was unescaped successfully, false otherwise. + */ + static bool unescape_kql_value(std::string const& value, std::string& unescaped); + private: /** * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the @@ -270,6 +302,47 @@ class StringUtils { char const*& wild_current, char const*& wild_bookmark ); + + /** + * Converts a character into its two byte hexadecimal representation. + * @param c + * @return the two byte hexadecimal representation of c as an array of two characters. + */ + static std::array char_to_hex(char c) { + std::array ret; + auto nibble_to_hex = [](char nibble) -> char { + if ('\x00' <= nibble && nibble <= '\x09') { + return '0' + (nibble - '\x00'); + } else { + return 'a' + (nibble - '\x0a'); + } + }; + + return std::array{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; + } + + /** + * Converts a character into a unicode escape sequence (e.g. \u0000) and appends the escape + * sequences to the `destination` buffer. + * @param destination + * @param c + */ + static void char_to_escaped_four_char_hex(std::string& destination, char c) { + destination.append("\\u00"); + auto hex = char_to_hex(c); + destination.append(hex.data(), hex.size()); + } + + /** + * Unescapes a KQL key or value with special handling for each case and append the unescaped + * value to the `unescaped` buffer. + * @param value + * @param unescaped + * @param is_value + * @return true if the value was unescaped succesfully and false otherwise. + */ + static bool + unescape_kql_internal(std::string const& value, std::string& unescaped, bool is_value); }; enum EvaluatedValue { diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index c21a1f3d3..4d4124516 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -207,7 +207,7 @@ bool search_archive( SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column); return false; } - projection->add_column(ColumnDescriptor::create(descriptor_tokens)); + projection->add_column(ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens)); } } catch (clp_s::TraceableException& e) { SPDLOG_ERROR("{}", e.what()); diff --git a/components/core/src/clp_s/search/AddTimestampConditions.cpp b/components/core/src/clp_s/search/AddTimestampConditions.cpp index 5af7ef44a..addf7069f 100644 --- a/components/core/src/clp_s/search/AddTimestampConditions.cpp +++ b/components/core/src/clp_s/search/AddTimestampConditions.cpp @@ -18,7 +18,8 @@ std::shared_ptr AddTimestampConditions::run(std::shared_ptr const& descriptors) { DescriptorList list; for (std::string const& descriptor : descriptors) { - list.push_back(DescriptorToken(descriptor)); + list.push_back(DescriptorToken::create_descriptor_from_escaped_token(descriptor)); } return list; } @@ -15,25 +15,25 @@ void ColumnDescriptor::check_and_set_unresolved_descriptor_flag() { m_unresolved_descriptors = false; m_pure_wildcard = m_descriptors.size() == 1 && m_descriptors[0].wildcard(); for (auto const& token : m_descriptors) { - if (token.wildcard() || token.regex()) { + if (token.wildcard()) { m_unresolved_descriptors = true; break; } } } -ColumnDescriptor::ColumnDescriptor(std::string const& descriptor) { +ColumnDescriptor::ColumnDescriptor(std::string const& token) { m_flags = cAllTypes; - m_descriptors.emplace_back(descriptor); + m_descriptors.emplace_back(DescriptorToken::create_descriptor_from_escaped_token(token)); check_and_set_unresolved_descriptor_flag(); if (is_unresolved_descriptor()) { simplify_descriptor_wildcards(); } } -ColumnDescriptor::ColumnDescriptor(std::vector const& descriptors) { +ColumnDescriptor::ColumnDescriptor(std::vector const& tokens) { m_flags = cAllTypes; - m_descriptors = std::move(tokenize_descriptor(descriptors)); + m_descriptors = std::move(tokenize_descriptor(tokens)); check_and_set_unresolved_descriptor_flag(); if (is_unresolved_descriptor()) { simplify_descriptor_wildcards(); @@ -49,17 +49,21 @@ ColumnDescriptor::ColumnDescriptor(DescriptorList const& descriptors) { } } -std::shared_ptr ColumnDescriptor::create(std::string const& descriptor) { - return std::shared_ptr(new ColumnDescriptor(descriptor)); +std::shared_ptr ColumnDescriptor::create_from_escaped_token( + std::string const& token +) { + return std::shared_ptr(new ColumnDescriptor(token)); } -std::shared_ptr ColumnDescriptor::create( - std::vector const& descriptors +std::shared_ptr ColumnDescriptor::create_from_escaped_tokens( + std::vector const& tokens ) { - return std::shared_ptr(new ColumnDescriptor(descriptors)); + return std::shared_ptr(new ColumnDescriptor(tokens)); } -std::shared_ptr ColumnDescriptor::create(DescriptorList const& descriptors) { +std::shared_ptr ColumnDescriptor::create_from_descriptors( + DescriptorList const& descriptors +) { return std::shared_ptr(new ColumnDescriptor(descriptors)); } diff --git a/components/core/src/clp_s/search/ColumnDescriptor.hpp b/components/core/src/clp_s/search/ColumnDescriptor.hpp index ea1cfd7ec..d675d7ed0 100644 --- a/components/core/src/clp_s/search/ColumnDescriptor.hpp +++ b/components/core/src/clp_s/search/ColumnDescriptor.hpp @@ -7,6 +7,7 @@ #include #include +#include "../TraceableException.hpp" #include "Literal.hpp" namespace clp_s::search { @@ -15,27 +16,29 @@ namespace clp_s::search { */ class DescriptorToken { public: - // Constructors - DescriptorToken() = default; + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + }; /** - * Initialize the token from a string and set flags based on whether the token contains - * wildcards - * @param token the string to initialize the token from + * Creates a DescriptorToken from an escaped token string. The escape sequences '\\' and '\*' + * are supported in order to distinguish the literal '*' from the '*' used to match hierarchies + * of keys. + * @param token */ - explicit DescriptorToken(std::string_view const token) - : m_token(token), - m_wildcard(false), - m_regex(false) { - if (token == "*") { - m_wildcard = true; - } + static DescriptorToken create_descriptor_from_escaped_token(std::string_view const token) { + return DescriptorToken{token, false}; + } - for (char c : token) { - if (c == '*') { - m_regex = true; - } - } + /** + * Creates a DescriptorToken from a literal token string. The token is copied verbatim, and is + * never treated as a wildcard. + */ + static DescriptorToken create_descriptor_from_literal_token(std::string_view const token) { + return DescriptorToken{token, true}; } /** @@ -44,13 +47,6 @@ class DescriptorToken { */ bool wildcard() const { return m_wildcard; } - /** - * Whether the descriptor contains a wildcard somewhere - * TODO: Not currently used, and regex isn't currently supported - * @return true if the descriptor contains a wildcard - */ - bool regex() const { return m_regex; } - /** * Get a reference to the underlying token string * @return a reference to the underlying string @@ -62,14 +58,47 @@ class DescriptorToken { * @return Whether this token is equal to the given token */ bool operator==(DescriptorToken const& rhs) const { - // Note: we only need to compare the m_token field because m_regex and m_wildcard are - // derived from m_token. - return m_token == rhs.m_token; + return m_token == rhs.m_token && m_wildcard == rhs.m_wildcard; } private: - bool m_wildcard{}; - bool m_regex{}; + /** + * Initializes the token from a string and sets flags based on whether the token contains + * wildcards + * @param token the string to initialize the token from + * @param bool true if the string should be interpreted as literal, and false + */ + explicit DescriptorToken(std::string_view const token, bool is_literal) : m_wildcard(false) { + if (is_literal) { + m_token = token; + return; + } + + if (token == "*") { + m_wildcard = true; + } + + bool escaped{false}; + for (size_t i = 0; i < token.size(); ++i) { + if (false == escaped) { + if ('\\' == token[i]) { + escaped = true; + } else { + m_token.push_back(token[i]); + } + continue; + } else { + m_token.push_back(token[i]); + escaped = false; + } + } + + if (escaped) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + } + + bool m_wildcard{false}; std::string m_token; }; @@ -88,13 +117,44 @@ DescriptorList tokenize_descriptor(std::vector const& descriptors); class ColumnDescriptor : public Literal { public: /** - * Create a ColumnDescriptor literal from an integral value - * @param descriptor(s) the token or list of tokens making up the descriptor - * @return A ColumnDescriptor + * Creates a ColumnDescriptor literal from a list of escaped tokens. + * + * In particular there are escaping rules for literal '\' and '*': + * \\ -> literal \ + * \* -> literal * + * + * A '*' that is not escaped is treated as a wildcard descriptor. + * + * @param token(s) the escaped token or list of escaped tokens making up the descriptor + * @return A ColumnDescriptor literal */ - static std::shared_ptr create(std::string const& descriptor); - static std::shared_ptr create(std::vector const& descriptors); - static std::shared_ptr create(DescriptorList const& descriptors); + static std::shared_ptr create_from_escaped_token(std::string const& token); + static std::shared_ptr create_from_escaped_tokens( + std::vector const& tokens + ); + + /** + * Create a ColumnDescriptor literal from a list of already-parsed DescriptorToken. + * @param descriptors the list of parsed DescriptorToken + * @return A ColumnDescriptor literal + */ + static std::shared_ptr create_from_descriptors( + DescriptorList const& descriptors + ); + + /** + * Inserts an entire DescriptorList into this ColumnDescriptor before the specified position. + * @param an iterator indicating the position in the internal descriptor list before which the + * new descriptors will be inserted. + * @param source the list of descriptors to be inserted + */ + void insert(DescriptorList::iterator pos, DescriptorList const& source) { + m_descriptors.insert(pos, source.begin(), source.end()); + check_and_set_unresolved_descriptor_flag(); + if (is_unresolved_descriptor()) { + simplify_descriptor_wildcards(); + } + } /** * Deep copy of this ColumnDescriptor diff --git a/components/core/src/clp_s/search/Output.cpp b/components/core/src/clp_s/search/Output.cpp index c9954779b..7b632b389 100644 --- a/components/core/src/clp_s/search/Output.cpp +++ b/components/core/src/clp_s/search/Output.cpp @@ -666,9 +666,7 @@ bool Output::evaluate_array_filter_object( } for (auto field : object) { - // Note: field.key() yields the escaped JSON key, so the descriptor tokens passed to search - // must likewise be escaped. - if (field.key() != unresolved_tokens[cur_idx].get_token()) { + if (field.unescaped_key(true).value() != unresolved_tokens[cur_idx].get_token()) { continue; } diff --git a/components/core/src/clp_s/search/SchemaMatch.cpp b/components/core/src/clp_s/search/SchemaMatch.cpp index 203634000..6b5f2ce8c 100644 --- a/components/core/src/clp_s/search/SchemaMatch.cpp +++ b/components/core/src/clp_s/search/SchemaMatch.cpp @@ -77,12 +77,15 @@ std::shared_ptr SchemaMatch::populate_column_mapping(std::shared_ptr auto literal_type = node_to_literal_type(node->get_type()); DescriptorList descriptors; while (node->get_id() != m_tree->get_object_subtree_node_id()) { - // may have to explicitly mark non-regex - descriptors.emplace_back(node->get_key_name()); + descriptors.emplace_back( + DescriptorToken::create_descriptor_from_literal_token( + node->get_key_name() + ) + ); node = &m_tree->get_node(node->get_parent_id()); } std::reverse(descriptors.begin(), descriptors.end()); - auto resolved_column = ColumnDescriptor::create(descriptors); + auto resolved_column = ColumnDescriptor::create_from_descriptors(descriptors); resolved_column->set_matching_type(literal_type); *it = resolved_column; cur->copy_append(possibilities.get()); diff --git a/components/core/src/clp_s/search/kql/Kql.g4 b/components/core/src/clp_s/search/kql/Kql.g4 index 33abf66bd..2737e88fa 100644 --- a/components/core/src/clp_s/search/kql/Kql.g4 +++ b/components/core/src/clp_s/search/kql/Kql.g4 @@ -48,36 +48,28 @@ QUOTED_STRING: '"' QUOTED_CHARACTER* '"' ; UNQUOTED_LITERAL: UNQUOTED_CHARACTER+ ; -// TODO handle unicode fragment QUOTED_CHARACTER : ESCAPED_SPACE | '\\"' | ~'"' ; -// TODO: handle unicode fragment UNQUOTED_CHARACTER : ESCAPED_SPACE | ESCAPED_SPECIAL_CHARACTER - | ESCAPED_KEYWORD | WILDCARD + | UNICODE | ~[\\():<>"{} \r\n\t] ; fragment WILDCARD: '*' | '?'; -// TODO: unescape keywords -fragment ESCAPED_KEYWORD - : '\\' KEYWORD - ; - fragment KEYWORD : AND | OR | NOT ; - RANGE_OPERATOR : '<=' | '>=' @@ -99,9 +91,8 @@ fragment SPECIAL_CHARACTER : [\\():<>"*?{}.] ; - // For unicode hex -//UNICODE: 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; -//fragment HEXDIGIT: [0-9a-fA-F]+ ; +fragment UNICODE: '\\u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ; +fragment HEXDIGIT: [0-9a-fA-F] ; SPACE: [ \t\r\n] -> skip ; diff --git a/components/core/src/clp_s/search/kql/kql.cpp b/components/core/src/clp_s/search/kql/kql.cpp index 972e44ad7..098886671 100644 --- a/components/core/src/clp_s/search/kql/kql.cpp +++ b/components/core/src/clp_s/search/kql/kql.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -54,7 +55,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { private: static void prepend_column(std::shared_ptr const& desc, DescriptorList const& prefix) { - desc->get_descriptor_list().insert(desc->descriptor_begin(), prefix.begin(), prefix.end()); + desc->insert(desc->get_descriptor_list().begin(), prefix); } void prepend_column(std::shared_ptr const& expr, DescriptorList const& prefix) { @@ -69,7 +70,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { public: static std::string unquote_string(std::string const& text) { - if (text.at(0) == '"') { + if (false == text.empty() && '"' == text.at(0)) { return text.substr(1, text.length() - 2); } else { return text; @@ -83,7 +84,11 @@ class ParseTreeVisitor : public KqlBaseVisitor { } static std::shared_ptr unquote_literal(std::string const& text) { - std::string token = unquote_string(text); + std::string token; + if (false == StringUtils::unescape_kql_value(unquote_string(text), token)) { + SPDLOG_ERROR("Can not parse invalid literal: {}", text); + throw std::runtime_error{"Invalid literal."}; + } if (auto ret = Integral::create_from_string(token)) { return ret; @@ -97,7 +102,11 @@ class ParseTreeVisitor : public KqlBaseVisitor { } static std::shared_ptr unquote_date_literal(std::string const& text) { - std::string token = unquote_date_string(text); + std::string token; + if (false == StringUtils::unescape_kql_value(unquote_date_string(text), token)) { + SPDLOG_ERROR("Can not parse invalid date literal: {}", text); + throw std::runtime_error{"Invalid date literal."}; + } return DateLiteral::create_from_string(token); } @@ -117,7 +126,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { return nullptr; } - return ColumnDescriptor::create(descriptor_tokens); + return ColumnDescriptor::create_from_escaped_tokens(descriptor_tokens); } std::any visitNestedQuery(KqlParser::NestedQueryContext* ctx) override { @@ -193,7 +202,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { std::any visitValue_expression(KqlParser::Value_expressionContext* ctx) override { auto lit = unquote_literal(ctx->LITERAL()->getText()); - auto descriptor = ColumnDescriptor::create("*"); + auto descriptor = ColumnDescriptor::create_from_escaped_token("*"); return FilterExpr::create(descriptor, FilterOperation::EQ, lit); } @@ -213,7 +222,7 @@ class ParseTreeVisitor : public KqlBaseVisitor { base = OrExpr::create(); } - auto empty_descriptor = ColumnDescriptor::create(DescriptorList()); + auto empty_descriptor = ColumnDescriptor::create_from_descriptors(DescriptorList()); for (auto token : ctx->literals) { auto literal = unquote_literal(token->getText()); auto expr = FilterExpr::create( diff --git a/components/core/tests/test-kql.cpp b/components/core/tests/test-kql.cpp index 2646ff5e0..a39b12d07 100644 --- a/components/core/tests/test-kql.cpp +++ b/components/core/tests/test-kql.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include "../src/clp_s/search/AndExpr.hpp" @@ -65,7 +66,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(false == filter->is_inverted()); REQUIRE(FilterOperation::EQ == filter->get_operation()); REQUIRE(1 == filter->get_column()->get_descriptor_list().size()); - REQUIRE(DescriptorToken{"key"} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key"); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); std::string extracted_value; REQUIRE(filter->get_operand()->as_var_string(extracted_value, filter->get_operation())); REQUIRE("value" == extracted_value); @@ -82,7 +84,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(true == not_filter->is_inverted()); REQUIRE(FilterOperation::EQ == not_filter->get_operation()); REQUIRE(1 == not_filter->get_column()->get_descriptor_list().size()); - REQUIRE(DescriptorToken{"key"} == *not_filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token("key"); + REQUIRE(key_token == *not_filter->get_column()->descriptor_begin()); std::string extracted_value; REQUIRE(not_filter->get_operand() ->as_var_string(extracted_value, not_filter->get_operation())); @@ -127,7 +130,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { std::string key = "key" + std::to_string(suffix_number); std::string value = "value" + std::to_string(suffix_number); REQUIRE(value == extracted_value); - REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); ++suffix_number; } } @@ -176,7 +180,8 @@ TEST_CASE("Test parsing KQL", "[KQL]") { std::string key = "key" + std::to_string(suffix_number); std::string value = "value" + std::to_string(suffix_number); REQUIRE(value == extracted_value); - REQUIRE(DescriptorToken{key} == *filter->get_column()->descriptor_begin()); + auto const key_token = DescriptorToken::create_descriptor_from_escaped_token(key); + REQUIRE(key_token == *filter->get_column()->descriptor_begin()); ++suffix_number; } } @@ -206,8 +211,10 @@ TEST_CASE("Test parsing KQL", "[KQL]") { REQUIRE(FilterOperation::EQ == filter->get_operation()); REQUIRE(2 == filter->get_column()->get_descriptor_list().size()); auto it = filter->get_column()->descriptor_begin(); - REQUIRE(DescriptorToken{"a.b"} == *it++); - REQUIRE(DescriptorToken{"c"} == *it++); + auto const a_b_token = DescriptorToken::create_descriptor_from_escaped_token("a.b"); + REQUIRE(a_b_token == *it++); + auto const c_token = DescriptorToken::create_descriptor_from_escaped_token("c"); + REQUIRE(c_token == *it++); } SECTION("Illegal escape sequences in column name") { @@ -232,4 +239,33 @@ TEST_CASE("Test parsing KQL", "[KQL]") { auto filter = parse_kql_expression(empty_token_column); REQUIRE(nullptr == filter); } + + SECTION("Escape sequences in value") { + auto translated_pair = GENERATE( + std::pair{"\\\\", "\\\\"}, + std::pair{"\\??", "\\??"}, + std::pair{"\\**", "\\**"}, + std::pair{"\\u9999", "香"}, + std::pair{"\\r\\n\\t\\b\\f", "\r\n\t\b\f"}, + std::pair{"\\\"", "\""}, + std::pair{"\\{\\}\\(\\)\\<\\>", "{}()<>"}, + std::pair{"\\u003F", "\\?"}, + std::pair{"\\u002A", "\\*"}, + std::pair{"\\u005C", "\\\\"} + ); + + auto formatted_query = fmt::format("*: \"{}\"", translated_pair.first); + stringstream query{formatted_query}; + auto filter = std::dynamic_pointer_cast(parse_kql_expression(query)); + REQUIRE(nullptr != filter); + REQUIRE(nullptr != filter->get_operand()); + REQUIRE(nullptr != filter->get_column()); + REQUIRE(false == filter->has_only_expression_operands()); + REQUIRE(false == filter->is_inverted()); + REQUIRE(FilterOperation::EQ == filter->get_operation()); + REQUIRE(true == filter->get_column()->is_pure_wildcard()); + std::string literal; + REQUIRE(true == filter->get_operand()->as_var_string(literal, FilterOperation::EQ)); + REQUIRE(literal == translated_pair.second); + } } diff --git a/docs/src/user-guide/reference-json-search-syntax.md b/docs/src/user-guide/reference-json-search-syntax.md index 18d0e4267..8a7013007 100644 --- a/docs/src/user-guide/reference-json-search-syntax.md +++ b/docs/src/user-guide/reference-json-search-syntax.md @@ -42,7 +42,7 @@ be interpreted as _exact_ searches unless they include [wildcards](#wildcards-in :::{note} Certain characters have special meanings when used in keys or values, so to search for the characters literally, you must escape them. For a list of such characters, see -[Escaping special characters](#escaping-special-characters). +[Characters that require escaping](#characters-that-require-escaping). ::: ### Querying nested kv-pairs @@ -164,13 +164,14 @@ There are three supported boolean operators: You can use parentheses (`()`) to apply an operator to a group of expressions. -### Escaping special characters +### Characters that require escaping Keys containing the following literal characters must escape the characters using a `\` (backslash): * `\` * `"` * `.` +* `*` Values containing the following literal characters must escape the characters using a `\` (backslash): @@ -191,6 +192,24 @@ characters using a `\` (backslash): * `{` * `}` +### Supported escape sequences + +Keys and values can represent Unicode codepoints using the `\uXXXX` escape sequence, where each `X` +is a hexadecimal character. + +Keys and values also support the following escape sequences to represent control characters: + +* `\b` +* `\f` +* `\n` +* `\r` +* `\t` + +:::{note} +The escape sequences in this section are described verbatim and don't need an extra backslash to +escape the backslash at the beginning of each sequence. +::: + ## Examples **Search for log events that contain a specific key-value pair:**