Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect logtype encoding by properly escaping variable placeholder characters (fixes #163). #162

Merged
merged 32 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c91f625
Move parsing methods from ffi to ir namespace.
kirkrodrigues Sep 16, 2023
d0779ba
Replace existing parsing methods with those from ir namespace.
kirkrodrigues Sep 17, 2023
e2474fa
Improve performance of parsing methods from ffi to match those in CLP.
kirkrodrigues Sep 17, 2023
1c920b4
Fix build error
kirkrodrigues Sep 17, 2023
aa1d327
C++ 101: Inline methods should be defined in the header.
kirkrodrigues Sep 17, 2023
7fd4656
Replace LogTypeDictionaryEntry::VarDelim with ir::VariablePlaceholder…
kirkrodrigues Sep 17, 2023
22dc820
Apply suggestions from code review
kirkrodrigues Sep 17, 2023
10e339a
Add suggestion from code review with formatting.
kirkrodrigues Sep 17, 2023
22bfa84
Fix dictionary collision for compression/decompression
LinZhihao-723 Sep 18, 2023
e73d284
Merge
LinZhihao-723 Sep 18, 2023
7d6d749
Fix the generated log type
LinZhihao-723 Sep 18, 2023
8ebf5de
Fix var tracking
LinZhihao-723 Sep 18, 2023
5963bbd
Add missing const
LinZhihao-723 Sep 18, 2023
6199eec
Fix search
LinZhihao-723 Sep 18, 2023
40f02b0
Track the number of escape placeholders when parsing the logtype in t…
LinZhihao-723 Sep 18, 2023
a72d52f
Don't unescape logtypes when ingesting IR log events.
kirkrodrigues Sep 19, 2023
251b8a7
Double escape when building the search logtype
LinZhihao-723 Sep 23, 2023
b61e324
Refactor the code to reduce the inner branching
LinZhihao-723 Sep 24, 2023
dde3dfb
Apply suggestions from code review
LinZhihao-723 Nov 29, 2023
0fe414d
Undo parsing.hpp
LinZhihao-723 Nov 29, 2023
37eba2c
Rename escape_var
LinZhihao-723 Nov 29, 2023
eb8e9cb
Refactor member variable init
LinZhihao-723 Nov 29, 2023
d44862d
Refactoring the place to do the variabel initializations
LinZhihao-723 Nov 29, 2023
0ab0024
Fix the variable placeholder
LinZhihao-723 Nov 29, 2023
8c77fbb
Removing cVariablePlaceholder
LinZhihao-723 Nov 29, 2023
bf95c9a
Adding a case to the unit test
LinZhihao-723 Nov 29, 2023
469c2b3
Refactor the escape function
LinZhihao-723 Nov 30, 2023
62363ec
Fix subquery
LinZhihao-723 Nov 30, 2023
7c3189f
Fix ffi and search
LinZhihao-723 Nov 30, 2023
ad48240
Delete components/core/src/ffi/ir_stream/Attribute.hpp
LinZhihao-723 Dec 1, 2023
a37f90c
Bug-fix: Apply correct escaping for wildcard queries; Some refactoring.
kirkrodrigues Dec 5, 2023
07d6d33
Merge branch 'main' into dict_collision
LinZhihao-723 Dec 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING "Choose the type of build." FORCE)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Set general compressor
set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor")
set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd)
Expand Down Expand Up @@ -253,6 +255,7 @@ set(SOURCE_FILES_clp
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down Expand Up @@ -436,6 +439,7 @@ set(SOURCE_FILES_clg
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -594,6 +598,7 @@ set(SOURCE_FILES_clo
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -797,6 +802,7 @@ set(SOURCE_FILES_unitTest
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down
28 changes: 15 additions & 13 deletions components/core/src/EncodedVariableInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary(
encoded_vars.push_back(encoded_var);
};

ffi::ir_stream::generic_decode_message(
ffi::ir_stream::generic_decode_message<false>(
log_event.get_logtype(),
log_event.get_encoded_vars(),
log_event.get_dict_vars(),
Expand All @@ -287,38 +287,40 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary(
bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDictionaryEntry& logtype_dict_entry, const VariableDictionaryReader& var_dict,
const vector<encoded_variable_t>& encoded_vars, string& decompressed_msg)
{
size_t num_vars_in_logtype = logtype_dict_entry.get_num_vars();

// Ensure the number of variables in the logtype matches the number of encoded variables given
const auto& logtype_value = logtype_dict_entry.get_value();
if (num_vars_in_logtype != encoded_vars.size()) {
size_t const num_vars = logtype_dict_entry.get_num_variables();
if (num_vars != encoded_vars.size()) {
SPDLOG_ERROR("EncodedVariableInterpreter: Logtype '{}' contains {} variables, but {} were given for decoding.", logtype_value.c_str(),
num_vars_in_logtype, encoded_vars.size());
num_vars, encoded_vars.size());
return false;
}

ir::VariablePlaceholder var_placeholder;
size_t constant_begin_pos = 0;
string float_str;
variable_dictionary_id_t var_dict_id;
for (size_t i = 0; i < num_vars_in_logtype; ++i) {
size_t var_position = logtype_dict_entry.get_var_info(i, var_placeholder);
size_t const num_placeholders_in_logtype = logtype_dict_entry.get_num_placeholders();
for (size_t placeholder_ix = 0, var_ix = 0; placeholder_ix < num_placeholders_in_logtype; ++placeholder_ix) {
size_t placeholder_position = logtype_dict_entry.get_placeholder_info(placeholder_ix, var_placeholder);

// Add the constant that's between the last variable and this one
// Add the constant that's between the last placeholder and this one
decompressed_msg.append(logtype_value, constant_begin_pos,
var_position - constant_begin_pos);
placeholder_position - constant_begin_pos);
switch (var_placeholder) {
case ir::VariablePlaceholder::Integer:
decompressed_msg += std::to_string(encoded_vars[i]);
decompressed_msg += std::to_string(encoded_vars[var_ix++]);
break;
case ir::VariablePlaceholder::Float:
convert_encoded_float_to_string(encoded_vars[i], float_str);
convert_encoded_float_to_string(encoded_vars[var_ix++], float_str);
decompressed_msg += float_str;
break;
case ir::VariablePlaceholder::Dictionary:
var_dict_id = decode_var_dict_id(encoded_vars[i]);
var_dict_id = decode_var_dict_id(encoded_vars[var_ix++]);
decompressed_msg += var_dict.get_value(var_dict_id);
break;
case ir::VariablePlaceholder::Escape:
break;
default:
SPDLOG_ERROR(
"EncodedVariableInterpreter: Logtype '{}' contains "
Expand All @@ -328,7 +330,7 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
return false;
}
// Move past the variable placeholder
constant_begin_pos = var_position + 1;
constant_begin_pos = placeholder_position + 1;
}
// Append remainder of logtype, if any
if (constant_begin_pos < logtype_value.length()) {
Expand Down
35 changes: 32 additions & 3 deletions components/core/src/Grep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,30 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv
{
size_t last_token_end_pos = 0;
string logtype;
auto escape_handler
= [](std::string_view constant, size_t char_to_escape_pos, string& logtype) -> void {
auto const escape_char{enum_to_underlying_type(ir::VariablePlaceholder::Escape)};
auto const next_char_pos{char_to_escape_pos + 1};
// NOTE: We don't want to add additional escapes for wildcards that have
// been escaped. E.g., the query "\\*" should remain unchanged.
if (next_char_pos < constant.length()
&& false == is_wildcard(constant[next_char_pos]))
{
logtype += escape_char;
} else if (ir::is_variable_placeholder(constant[char_to_escape_pos])) {
logtype += escape_char;
logtype += escape_char;
}
};
for (const auto& query_token : query_tokens) {
// Append from end of last token to beginning of this token, to logtype
logtype.append(processed_search_string, last_token_end_pos, query_token.get_begin_pos() - last_token_end_pos);
ir::append_constant_to_logtype(
static_cast<std::string_view>(processed_search_string)
.substr(last_token_end_pos,
query_token.get_begin_pos() - last_token_end_pos),
escape_handler,
logtype
);
last_token_end_pos = query_token.get_end_pos();

if (query_token.is_wildcard()) {
Expand All @@ -358,7 +379,10 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv
}
} else {
if (!query_token.is_var()) {
logtype += query_token.get_value();
ir::append_constant_to_logtype(
query_token.get_value(), escape_handler,
logtype
);
} else if (!process_var_token(query_token, archive, ignore_case, sub_query, logtype)) {
return SubQueryMatchabilityResult::WontMatch;
}
Expand All @@ -367,7 +391,12 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv

if (last_token_end_pos < processed_search_string.length()) {
// Append from end of last token to end
logtype.append(processed_search_string, last_token_end_pos, string::npos);
ir::append_constant_to_logtype(
static_cast<std::string_view>(processed_search_string)
.substr(last_token_end_pos, string::npos),
escape_handler,
logtype
);
last_token_end_pos = processed_search_string.length();
}

Expand Down
90 changes: 47 additions & 43 deletions components/core/src/LogTypeDictionaryEntry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,26 @@
#include "type_utils.hpp"
#include "Utils.hpp"

using std::string_view;
using std::string;

size_t LogTypeDictionaryEntry::get_var_info(
size_t var_ix,
ir::VariablePlaceholder& var_placeholder
size_t LogTypeDictionaryEntry::get_placeholder_info(
size_t placeholder_ix,
ir::VariablePlaceholder& placeholder
) const {
if (var_ix >= m_var_positions.size()) {
if (placeholder_ix >= m_placeholder_positions.size()) {
return SIZE_MAX;
}

auto var_position = m_var_positions[var_ix];
var_placeholder = static_cast<ir::VariablePlaceholder>(m_value[var_position]);
auto var_position = m_placeholder_positions[placeholder_ix];
placeholder = static_cast<ir::VariablePlaceholder>(m_value[var_position]);

return m_var_positions[var_ix];
return m_placeholder_positions[placeholder_ix];
}

size_t LogTypeDictionaryEntry::get_data_size () const {
// NOTE: sizeof(vector[0]) is executed at compile time so there's no risk of an exception at runtime
return sizeof(m_id) + m_value.length() + m_var_positions.size() * sizeof(m_var_positions[0]) +
return sizeof(m_id) + m_value.length() + m_placeholder_positions.size() * sizeof(m_placeholder_positions[0]) +
m_ids_of_segments_containing_entry.size() * sizeof(segment_id_t);
}

Expand All @@ -32,49 +33,73 @@ void LogTypeDictionaryEntry::add_constant (const string& value_containing_consta
}

void LogTypeDictionaryEntry::add_dictionary_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_dict_var(m_value);
}

void LogTypeDictionaryEntry::add_int_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_int_var(m_value);
}

void LogTypeDictionaryEntry::add_float_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_float_var(m_value);
}

void LogTypeDictionaryEntry::add_escape() {
m_placeholder_positions.push_back(m_value.length());
add_escape(m_value);
++m_num_escaped_placeholders;
}

bool LogTypeDictionaryEntry::parse_next_var (const string& msg, size_t& var_begin_pos, size_t& var_end_pos, string& var) {
auto last_var_end_pos = var_end_pos;
// clang-format off
auto escape_handler = [&](
[[maybe_unused]] string_view constant,
[[maybe_unused]] size_t char_to_escape_pos,
string& logtype
) -> void {
m_placeholder_positions.push_back(logtype.size());
++m_num_escaped_placeholders;
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Escape);
};
// clang-format on
if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) {
// Append to log type: from end of last variable to start of current variable
add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos);
auto constant = static_cast<string_view>(msg).substr(
last_var_end_pos,
var_begin_pos - last_var_end_pos
);
ir::append_constant_to_logtype(constant, escape_handler, m_value);

var.assign(msg, var_begin_pos, var_end_pos - var_begin_pos);
return true;
}
if (last_var_end_pos < msg.length()) {
// Append to log type: from end of last variable to end
add_constant(msg, last_var_end_pos, msg.length() - last_var_end_pos);
auto constant = static_cast<string_view>(msg).substr(
last_var_end_pos,
msg.length() - last_var_end_pos
);
ir::append_constant_to_logtype(constant, escape_handler, m_value);
}

return false;
}

void LogTypeDictionaryEntry::clear () {
m_value.clear();
m_var_positions.clear();
m_placeholder_positions.clear();
m_num_escaped_placeholders = 0;
}

void LogTypeDictionaryEntry::write_to_file (streaming_compression::Compressor& compressor) const {
compressor.write_numeric_value(m_id);

string escaped_value;
get_value_with_unfounded_variables_escaped(escaped_value);
compressor.write_numeric_value<uint64_t>(escaped_value.length());
compressor.write_string(escaped_value);
compressor.write_numeric_value<uint64_t>(m_value.length());
compressor.write_string(m_value);
}

ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Decompressor& decompressor) {
Expand Down Expand Up @@ -107,8 +132,11 @@ ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Dec
if (is_escaped) {
constant += c;
is_escaped = false;
} else if (ir::cVariablePlaceholderEscapeCharacter == c) {
} else if (enum_to_underlying_type(ir::VariablePlaceholder::Escape) == c) {
is_escaped = true;
add_constant(constant, 0, constant.length());
constant.clear();
add_escape();
} else {
if (enum_to_underlying_type(ir::VariablePlaceholder::Integer) == c) {
add_constant(constant, 0, constant.length());
Expand Down Expand Up @@ -141,27 +169,3 @@ void LogTypeDictionaryEntry::read_from_file (streaming_compression::Decompressor
throw OperationFailed(error_code, __FILENAME__, __LINE__);
}
}

void LogTypeDictionaryEntry::get_value_with_unfounded_variables_escaped (string& escaped_logtype_value) const {
auto value_view = static_cast<std::string_view>(m_value);
size_t begin_ix = 0;
// Reset escaped value and reserve enough space to at least contain the whole value
escaped_logtype_value.clear();
escaped_logtype_value.reserve(value_view.length());
for (auto var_position : m_var_positions) {
size_t end_ix = var_position;

ir::escape_and_append_constant_to_logtype(
value_view.substr(begin_ix, end_ix - begin_ix),
escaped_logtype_value
);

// Add variable placeholder
escaped_logtype_value += value_view[end_ix];

// Move begin to start of next portion of logtype between variables
begin_ix = end_ix + 1;
}
// Escape any variable placeholders in remainder of value
ir::escape_and_append_constant_to_logtype(value_view.substr(begin_ix), escaped_logtype_value);
}
45 changes: 30 additions & 15 deletions components/core/src/LogTypeDictionaryEntry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,33 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
static void add_float_var (std::string& logtype) {
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Float);
}
/**
* Adds an escape character to the given logtype
* @param logtype
*/
static void add_escape (std::string& logtype) {
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Escape);
}

size_t get_num_vars () const { return m_var_positions.size(); }
/**
* Gets all info about a variable in the logtype
* @param var_ix The index of the variable to get the info for
* @param var_placeholder
* @return The variable's position in the logtype, or SIZE_MAX if var_ix is out of bounds
* @return The number of variable placeholders (including escaped ones) in the logtype.
*/
size_t get_num_placeholders () const { return m_placeholder_positions.size(); }

/**
* @return The number of variable placeholders (excluding escaped ones) in the logtype.
*/
size_t get_num_variables () const {
return m_placeholder_positions.size() - m_num_escaped_placeholders;
}

/**
* Gets all info about a variable placeholder in the logtype
* @param placeholder_ix The index of the placeholder to get the info for
* @param placeholder
* @return The placeholder's position in the logtype, or SIZE_MAX if var_ix is out of bounds
*/
size_t get_var_info (size_t var_ix, ir::VariablePlaceholder& var_placeholder) const;
size_t get_placeholder_info (size_t placeholder_ix, ir::VariablePlaceholder& placeholder) const;

/**
* Gets the size (in-memory) of the data contained in this entry
Expand All @@ -98,6 +116,10 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
* Adds a dictionary variable placeholder
*/
void add_dictionary_var ();
/**
* Adds an escape character
*/
void add_escape ();

/**
* Parses next variable from a message, constructing the constant part of the message's logtype as well
Expand Down Expand Up @@ -137,16 +159,9 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
void read_from_file (streaming_compression::Decompressor& decompressor);

private:
// Methods
/**
* Escapes any variable placeholders that don't correspond to the positions
* of variables in the logtype entry's value
* @param escaped_logtype_value
*/
void get_value_with_unfounded_variables_escaped (std::string& escaped_logtype_value) const;

// Variables
std::vector<size_t> m_var_positions;
std::vector<size_t> m_placeholder_positions;
size_t m_num_escaped_placeholders{0};
};

#endif // LOGTYPEDICTIONARYENTRY_HPP
Loading