Skip to content

Commit

Permalink
Fix incorrect logtype encoding by properly escaping variable placehol…
Browse files Browse the repository at this point in the history
…der characters (fixes #163). (#162)
  • Loading branch information
LinZhihao-723 authored Dec 5, 2023
1 parent a65dcb2 commit 3dcac07
Show file tree
Hide file tree
Showing 22 changed files with 463 additions and 210 deletions.
6 changes: 6 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING "Choose the type of build." FORCE)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Set general compressor
set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor")
set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd)
Expand Down Expand Up @@ -253,6 +255,7 @@ set(SOURCE_FILES_clp
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down Expand Up @@ -436,6 +439,7 @@ set(SOURCE_FILES_clg
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -594,6 +598,7 @@ set(SOURCE_FILES_clo
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -797,6 +802,7 @@ set(SOURCE_FILES_unitTest
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/parsing.inc
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down
28 changes: 15 additions & 13 deletions components/core/src/EncodedVariableInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary(
encoded_vars.push_back(encoded_var);
};

ffi::ir_stream::generic_decode_message(
ffi::ir_stream::generic_decode_message<false>(
log_event.get_logtype(),
log_event.get_encoded_vars(),
log_event.get_dict_vars(),
Expand All @@ -287,38 +287,40 @@ void EncodedVariableInterpreter::encode_and_add_to_dictionary(
bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDictionaryEntry& logtype_dict_entry, const VariableDictionaryReader& var_dict,
const vector<encoded_variable_t>& encoded_vars, string& decompressed_msg)
{
size_t num_vars_in_logtype = logtype_dict_entry.get_num_vars();

// Ensure the number of variables in the logtype matches the number of encoded variables given
const auto& logtype_value = logtype_dict_entry.get_value();
if (num_vars_in_logtype != encoded_vars.size()) {
size_t const num_vars = logtype_dict_entry.get_num_variables();
if (num_vars != encoded_vars.size()) {
SPDLOG_ERROR("EncodedVariableInterpreter: Logtype '{}' contains {} variables, but {} were given for decoding.", logtype_value.c_str(),
num_vars_in_logtype, encoded_vars.size());
num_vars, encoded_vars.size());
return false;
}

ir::VariablePlaceholder var_placeholder;
size_t constant_begin_pos = 0;
string float_str;
variable_dictionary_id_t var_dict_id;
for (size_t i = 0; i < num_vars_in_logtype; ++i) {
size_t var_position = logtype_dict_entry.get_var_info(i, var_placeholder);
size_t const num_placeholders_in_logtype = logtype_dict_entry.get_num_placeholders();
for (size_t placeholder_ix = 0, var_ix = 0; placeholder_ix < num_placeholders_in_logtype; ++placeholder_ix) {
size_t placeholder_position = logtype_dict_entry.get_placeholder_info(placeholder_ix, var_placeholder);

// Add the constant that's between the last variable and this one
// Add the constant that's between the last placeholder and this one
decompressed_msg.append(logtype_value, constant_begin_pos,
var_position - constant_begin_pos);
placeholder_position - constant_begin_pos);
switch (var_placeholder) {
case ir::VariablePlaceholder::Integer:
decompressed_msg += std::to_string(encoded_vars[i]);
decompressed_msg += std::to_string(encoded_vars[var_ix++]);
break;
case ir::VariablePlaceholder::Float:
convert_encoded_float_to_string(encoded_vars[i], float_str);
convert_encoded_float_to_string(encoded_vars[var_ix++], float_str);
decompressed_msg += float_str;
break;
case ir::VariablePlaceholder::Dictionary:
var_dict_id = decode_var_dict_id(encoded_vars[i]);
var_dict_id = decode_var_dict_id(encoded_vars[var_ix++]);
decompressed_msg += var_dict.get_value(var_dict_id);
break;
case ir::VariablePlaceholder::Escape:
break;
default:
SPDLOG_ERROR(
"EncodedVariableInterpreter: Logtype '{}' contains "
Expand All @@ -328,7 +330,7 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
return false;
}
// Move past the variable placeholder
constant_begin_pos = var_position + 1;
constant_begin_pos = placeholder_position + 1;
}
// Append remainder of logtype, if any
if (constant_begin_pos < logtype_value.length()) {
Expand Down
35 changes: 32 additions & 3 deletions components/core/src/Grep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,30 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv
{
size_t last_token_end_pos = 0;
string logtype;
auto escape_handler
= [](std::string_view constant, size_t char_to_escape_pos, string& logtype) -> void {
auto const escape_char{enum_to_underlying_type(ir::VariablePlaceholder::Escape)};
auto const next_char_pos{char_to_escape_pos + 1};
// NOTE: We don't want to add additional escapes for wildcards that have
// been escaped. E.g., the query "\\*" should remain unchanged.
if (next_char_pos < constant.length()
&& false == is_wildcard(constant[next_char_pos]))
{
logtype += escape_char;
} else if (ir::is_variable_placeholder(constant[char_to_escape_pos])) {
logtype += escape_char;
logtype += escape_char;
}
};
for (const auto& query_token : query_tokens) {
// Append from end of last token to beginning of this token, to logtype
logtype.append(processed_search_string, last_token_end_pos, query_token.get_begin_pos() - last_token_end_pos);
ir::append_constant_to_logtype(
static_cast<std::string_view>(processed_search_string)
.substr(last_token_end_pos,
query_token.get_begin_pos() - last_token_end_pos),
escape_handler,
logtype
);
last_token_end_pos = query_token.get_end_pos();

if (query_token.is_wildcard()) {
Expand All @@ -358,7 +379,10 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv
}
} else {
if (!query_token.is_var()) {
logtype += query_token.get_value();
ir::append_constant_to_logtype(
query_token.get_value(), escape_handler,
logtype
);
} else if (!process_var_token(query_token, archive, ignore_case, sub_query, logtype)) {
return SubQueryMatchabilityResult::WontMatch;
}
Expand All @@ -367,7 +391,12 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv

if (last_token_end_pos < processed_search_string.length()) {
// Append from end of last token to end
logtype.append(processed_search_string, last_token_end_pos, string::npos);
ir::append_constant_to_logtype(
static_cast<std::string_view>(processed_search_string)
.substr(last_token_end_pos, string::npos),
escape_handler,
logtype
);
last_token_end_pos = processed_search_string.length();
}

Expand Down
90 changes: 47 additions & 43 deletions components/core/src/LogTypeDictionaryEntry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,26 @@
#include "type_utils.hpp"
#include "Utils.hpp"

using std::string_view;
using std::string;

size_t LogTypeDictionaryEntry::get_var_info(
size_t var_ix,
ir::VariablePlaceholder& var_placeholder
size_t LogTypeDictionaryEntry::get_placeholder_info(
size_t placeholder_ix,
ir::VariablePlaceholder& placeholder
) const {
if (var_ix >= m_var_positions.size()) {
if (placeholder_ix >= m_placeholder_positions.size()) {
return SIZE_MAX;
}

auto var_position = m_var_positions[var_ix];
var_placeholder = static_cast<ir::VariablePlaceholder>(m_value[var_position]);
auto var_position = m_placeholder_positions[placeholder_ix];
placeholder = static_cast<ir::VariablePlaceholder>(m_value[var_position]);

return m_var_positions[var_ix];
return m_placeholder_positions[placeholder_ix];
}

size_t LogTypeDictionaryEntry::get_data_size () const {
// NOTE: sizeof(vector[0]) is executed at compile time so there's no risk of an exception at runtime
return sizeof(m_id) + m_value.length() + m_var_positions.size() * sizeof(m_var_positions[0]) +
return sizeof(m_id) + m_value.length() + m_placeholder_positions.size() * sizeof(m_placeholder_positions[0]) +
m_ids_of_segments_containing_entry.size() * sizeof(segment_id_t);
}

Expand All @@ -32,49 +33,73 @@ void LogTypeDictionaryEntry::add_constant (const string& value_containing_consta
}

void LogTypeDictionaryEntry::add_dictionary_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_dict_var(m_value);
}

void LogTypeDictionaryEntry::add_int_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_int_var(m_value);
}

void LogTypeDictionaryEntry::add_float_var () {
m_var_positions.push_back(m_value.length());
m_placeholder_positions.push_back(m_value.length());
add_float_var(m_value);
}

void LogTypeDictionaryEntry::add_escape() {
m_placeholder_positions.push_back(m_value.length());
add_escape(m_value);
++m_num_escaped_placeholders;
}

bool LogTypeDictionaryEntry::parse_next_var (const string& msg, size_t& var_begin_pos, size_t& var_end_pos, string& var) {
auto last_var_end_pos = var_end_pos;
// clang-format off
auto escape_handler = [&](
[[maybe_unused]] string_view constant,
[[maybe_unused]] size_t char_to_escape_pos,
string& logtype
) -> void {
m_placeholder_positions.push_back(logtype.size());
++m_num_escaped_placeholders;
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Escape);
};
// clang-format on
if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) {
// Append to log type: from end of last variable to start of current variable
add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos);
auto constant = static_cast<string_view>(msg).substr(
last_var_end_pos,
var_begin_pos - last_var_end_pos
);
ir::append_constant_to_logtype(constant, escape_handler, m_value);

var.assign(msg, var_begin_pos, var_end_pos - var_begin_pos);
return true;
}
if (last_var_end_pos < msg.length()) {
// Append to log type: from end of last variable to end
add_constant(msg, last_var_end_pos, msg.length() - last_var_end_pos);
auto constant = static_cast<string_view>(msg).substr(
last_var_end_pos,
msg.length() - last_var_end_pos
);
ir::append_constant_to_logtype(constant, escape_handler, m_value);
}

return false;
}

void LogTypeDictionaryEntry::clear () {
m_value.clear();
m_var_positions.clear();
m_placeholder_positions.clear();
m_num_escaped_placeholders = 0;
}

void LogTypeDictionaryEntry::write_to_file (streaming_compression::Compressor& compressor) const {
compressor.write_numeric_value(m_id);

string escaped_value;
get_value_with_unfounded_variables_escaped(escaped_value);
compressor.write_numeric_value<uint64_t>(escaped_value.length());
compressor.write_string(escaped_value);
compressor.write_numeric_value<uint64_t>(m_value.length());
compressor.write_string(m_value);
}

ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Decompressor& decompressor) {
Expand Down Expand Up @@ -107,8 +132,11 @@ ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Dec
if (is_escaped) {
constant += c;
is_escaped = false;
} else if (ir::cVariablePlaceholderEscapeCharacter == c) {
} else if (enum_to_underlying_type(ir::VariablePlaceholder::Escape) == c) {
is_escaped = true;
add_constant(constant, 0, constant.length());
constant.clear();
add_escape();
} else {
if (enum_to_underlying_type(ir::VariablePlaceholder::Integer) == c) {
add_constant(constant, 0, constant.length());
Expand Down Expand Up @@ -141,27 +169,3 @@ void LogTypeDictionaryEntry::read_from_file (streaming_compression::Decompressor
throw OperationFailed(error_code, __FILENAME__, __LINE__);
}
}

void LogTypeDictionaryEntry::get_value_with_unfounded_variables_escaped (string& escaped_logtype_value) const {
auto value_view = static_cast<std::string_view>(m_value);
size_t begin_ix = 0;
// Reset escaped value and reserve enough space to at least contain the whole value
escaped_logtype_value.clear();
escaped_logtype_value.reserve(value_view.length());
for (auto var_position : m_var_positions) {
size_t end_ix = var_position;

ir::escape_and_append_constant_to_logtype(
value_view.substr(begin_ix, end_ix - begin_ix),
escaped_logtype_value
);

// Add variable placeholder
escaped_logtype_value += value_view[end_ix];

// Move begin to start of next portion of logtype between variables
begin_ix = end_ix + 1;
}
// Escape any variable placeholders in remainder of value
ir::escape_and_append_constant_to_logtype(value_view.substr(begin_ix), escaped_logtype_value);
}
45 changes: 30 additions & 15 deletions components/core/src/LogTypeDictionaryEntry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,33 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
static void add_float_var (std::string& logtype) {
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Float);
}
/**
* Adds an escape character to the given logtype
* @param logtype
*/
static void add_escape (std::string& logtype) {
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Escape);
}

size_t get_num_vars () const { return m_var_positions.size(); }
/**
* Gets all info about a variable in the logtype
* @param var_ix The index of the variable to get the info for
* @param var_placeholder
* @return The variable's position in the logtype, or SIZE_MAX if var_ix is out of bounds
* @return The number of variable placeholders (including escaped ones) in the logtype.
*/
size_t get_num_placeholders () const { return m_placeholder_positions.size(); }

/**
* @return The number of variable placeholders (excluding escaped ones) in the logtype.
*/
size_t get_num_variables () const {
return m_placeholder_positions.size() - m_num_escaped_placeholders;
}

/**
* Gets all info about a variable placeholder in the logtype
* @param placeholder_ix The index of the placeholder to get the info for
* @param placeholder
* @return The placeholder's position in the logtype, or SIZE_MAX if var_ix is out of bounds
*/
size_t get_var_info (size_t var_ix, ir::VariablePlaceholder& var_placeholder) const;
size_t get_placeholder_info (size_t placeholder_ix, ir::VariablePlaceholder& placeholder) const;

/**
* Gets the size (in-memory) of the data contained in this entry
Expand All @@ -98,6 +116,10 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
* Adds a dictionary variable placeholder
*/
void add_dictionary_var ();
/**
* Adds an escape character
*/
void add_escape ();

/**
* Parses next variable from a message, constructing the constant part of the message's logtype as well
Expand Down Expand Up @@ -137,16 +159,9 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
void read_from_file (streaming_compression::Decompressor& decompressor);

private:
// Methods
/**
* Escapes any variable placeholders that don't correspond to the positions
* of variables in the logtype entry's value
* @param escaped_logtype_value
*/
void get_value_with_unfounded_variables_escaped (std::string& escaped_logtype_value) const;

// Variables
std::vector<size_t> m_var_positions;
std::vector<size_t> m_placeholder_positions;
size_t m_num_escaped_placeholders{0};
};

#endif // LOGTYPEDICTIONARYENTRY_HPP
Loading

0 comments on commit 3dcac07

Please sign in to comment.