Skip to content

Commit

Permalink
feat: Add support for escaping characters in KQL key names. (y-scope#560
Browse files Browse the repository at this point in the history
)
  • Loading branch information
gibber9809 authored Nov 13, 2024
1 parent 2dfcd8a commit 53c4f52
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 17 deletions.
7 changes: 6 additions & 1 deletion components/core/src/clp_s/JsonParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@ JsonParser::JsonParser(JsonParserOption const& option)
}

if (false == m_timestamp_key.empty()) {
clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column);
if (false
== clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
{
SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
}
}

for (auto& file_path : option.file_paths) {
Expand Down
4 changes: 3 additions & 1 deletion components/core/src/clp_s/TimestampDictionaryReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ void TimestampDictionaryReader::read_new_entries() {
TimestampEntry entry;
std::vector<std::string> tokens;
entry.try_read_from_file(m_dictionary_decompressor);
StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens);
if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
}
m_entries.emplace_back(std::move(entry));

// TODO: Currently, we only allow a single authoritative timestamp column at ingestion time,
Expand Down
34 changes: 25 additions & 9 deletions components/core/src/clp_s/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -427,18 +427,34 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
return true;
}

void StringUtils::tokenize_column_descriptor(
bool StringUtils::tokenize_column_descriptor(
std::string const& descriptor,
std::vector<std::string>& tokens
) {
// TODO: handle escaped . correctly
auto start = 0U;
auto end = descriptor.find('.');
while (end != std::string::npos) {
tokens.push_back(descriptor.substr(start, end - start));
start = end + 1;
end = descriptor.find('.', start);
// TODO: add support for unicode sequences e.g. \u263A
std::string cur_tok;
for (size_t cur = 0; cur < descriptor.size(); ++cur) {
if ('\\' == descriptor[cur]) {
++cur;
if (cur >= descriptor.size()) {
return false;
}
} else if ('.' == descriptor[cur]) {
if (cur_tok.empty()) {
return false;
}
tokens.push_back(cur_tok);
cur_tok.clear();
continue;
}
cur_tok.push_back(descriptor[cur]);
}
tokens.push_back(descriptor.substr(start));

if (cur_tok.empty()) {
return false;
}

tokens.push_back(cur_tok);
return true;
}
} // namespace clp_s
4 changes: 2 additions & 2 deletions components/core/src/clp_s/Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,9 @@ class StringUtils {
* Converts a string column descriptor delimited by '.' into a list of tokens
* @param descriptor
* @param tokens
* @return the list of tokens pushed into the 'tokens' parameter
* @return true if the descriptor was tokenized successfully, false otherwise
*/
static void
[[nodiscard]] static bool
tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);

private:
Expand Down
5 changes: 4 additions & 1 deletion components/core/src/clp_s/clp-s.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ bool search_archive(
try {
for (auto const& column : command_line_arguments.get_projection_columns()) {
std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return false;
}
projection->add_column(ColumnDescriptor::create(descriptor_tokens));
}
} catch (clp_s::TraceableException& e) {
Expand Down
2 changes: 1 addition & 1 deletion components/core/src/clp_s/search/kql/Kql.g4
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ fragment ESCAPED_SPACE
;
fragment SPECIAL_CHARACTER
: [\\():<>"*?{}]
: [\\():<>"*?{}.]
;


Expand Down
11 changes: 9 additions & 2 deletions components/core/src/clp_s/search/kql/kql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,10 @@ class ParseTreeVisitor : public KqlBaseVisitor {
std::string column = unquote_string(ctx->LITERAL()->getText());

std::vector<std::string> descriptor_tokens;
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
return nullptr;
}

return ColumnDescriptor::create(descriptor_tokens);
}
Expand Down Expand Up @@ -248,6 +251,10 @@ std::shared_ptr<Expression> parse_kql_expression(std::istream& in) {
}

ParseTreeVisitor visitor;
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
try {
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
} catch (std::exception& e) {
return {};
}
}
} // namespace clp_s::search::kql
45 changes: 45 additions & 0 deletions components/core/tests/test-kql.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,49 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
auto failure = parse_kql_expression(incorrect_query);
REQUIRE(nullptr == failure);
}

SECTION("Escape sequences in column name") {
auto query = GENERATE(
"a\\.b.c: *",
"\"a\\.b.c\": *",
"a\\.b: {c: *}",
"\"a\\.b\": {\"c\": *}"
);
stringstream escaped_column_query{query};
auto filter
= std::dynamic_pointer_cast<FilterExpr>(parse_kql_expression(escaped_column_query));
REQUIRE(nullptr != filter);
REQUIRE(nullptr != filter->get_operand());
REQUIRE(nullptr != filter->get_column());
REQUIRE(false == filter->has_only_expression_operands());
REQUIRE(false == filter->is_inverted());
REQUIRE(FilterOperation::EQ == filter->get_operation());
REQUIRE(2 == filter->get_column()->get_descriptor_list().size());
auto it = filter->get_column()->descriptor_begin();
REQUIRE(DescriptorToken{"a.b"} == *it++);
REQUIRE(DescriptorToken{"c"} == *it++);
}

SECTION("Illegal escape sequences in column name") {
auto query = GENERATE(
//"a\\:*", this case is technically legal since ':' gets escaped
"\"a\\\":*",
"a\\ :*",
"\"a\\\" :*",
"a.:*",
"\"a.\":*",
"a. :*",
"\"a.\" :*"
);
stringstream illegal_escape{query};
auto filter = parse_kql_expression(illegal_escape);
REQUIRE(nullptr == filter);
}

SECTION("Empty token in column name") {
auto query = GENERATE(".a:*", "a.:*", "a..c:*", "a.b.:*");
stringstream empty_token_column{query};
auto filter = parse_kql_expression(empty_token_column);
REQUIRE(nullptr == filter);
}
}

0 comments on commit 53c4f52

Please sign in to comment.