Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate parsing methods from ffi and non-ffi code (fixes #160) #161

Merged
merged 8 commits into from
Sep 17, 2023
9 changes: 9 additions & 0 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ set(SOURCE_FILES_clp
src/ir/LogEvent.hpp
src/ir/LogEventDeserializer.cpp
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down Expand Up @@ -432,6 +434,8 @@ set(SOURCE_FILES_clg
src/Grep.cpp
src/Grep.hpp
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -588,6 +592,8 @@ set(SOURCE_FILES_clo
src/Grep.cpp
src/Grep.hpp
src/ir/LogEvent.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/LogTypeDictionaryEntry.cpp
src/LogTypeDictionaryEntry.hpp
src/LogTypeDictionaryReader.cpp
Expand Down Expand Up @@ -789,6 +795,8 @@ set(SOURCE_FILES_unitTest
src/ir/LogEvent.hpp
src/ir/LogEventDeserializer.cpp
src/ir/LogEventDeserializer.hpp
src/ir/parsing.cpp
src/ir/parsing.hpp
src/ir/utils.cpp
src/ir/utils.hpp
src/LibarchiveFileReader.cpp
Expand Down Expand Up @@ -890,6 +898,7 @@ set(SOURCE_FILES_unitTest
tests/test-encoding_methods.cpp
tests/test-Grep.cpp
tests/test-ir_encoding_methods.cpp
tests/test-ir_parsing.cpp
tests/test-main.cpp
tests/test-math_utils.cpp
tests/test-ParserWithUserSchema.cpp
Expand Down
2 changes: 2 additions & 0 deletions components/core/cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ set(SOURCE_FILES_make-dictionaries-readable
${CMAKE_CURRENT_SOURCE_DIR}/src/FileReader.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/FileWriter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/FileWriter.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/ir/parsing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/ir/parsing.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryEntry.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryEntry.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/LogTypeDictionaryReader.cpp
Expand Down
17 changes: 9 additions & 8 deletions components/core/src/EncodedVariableInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "Defs.h"
#include "ffi/encoding_methods.hpp"
#include "ffi/ir_stream/decoding_methods.hpp"
#include "ir/parsing.hpp"
#include "spdlog_with_specializations.hpp"
#include "string_utils.hpp"
#include "type_utils.hpp"
Expand Down Expand Up @@ -296,25 +297,25 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
return false;
}

LogTypeDictionaryEntry::VarDelim var_delim;
ir::VariablePlaceholder var_placeholder;
size_t constant_begin_pos = 0;
string float_str;
variable_dictionary_id_t var_dict_id;
for (size_t i = 0; i < num_vars_in_logtype; ++i) {
size_t var_position = logtype_dict_entry.get_var_info(i, var_delim);
size_t var_position = logtype_dict_entry.get_var_info(i, var_placeholder);

// Add the constant that's between the last variable and this one
decompressed_msg.append(logtype_value, constant_begin_pos,
var_position - constant_begin_pos);
switch (var_delim) {
case LogTypeDictionaryEntry::VarDelim::Integer:
switch (var_placeholder) {
case ir::VariablePlaceholder::Integer:
decompressed_msg += std::to_string(encoded_vars[i]);
break;
case LogTypeDictionaryEntry::VarDelim::Float:
case ir::VariablePlaceholder::Float:
convert_encoded_float_to_string(encoded_vars[i], float_str);
decompressed_msg += float_str;
break;
case LogTypeDictionaryEntry::VarDelim::Dictionary:
case ir::VariablePlaceholder::Dictionary:
var_dict_id = decode_var_dict_id(encoded_vars[i]);
decompressed_msg += var_dict.get_value(var_dict_id);
break;
Expand All @@ -323,10 +324,10 @@ bool EncodedVariableInterpreter::decode_variables_into_message (const LogTypeDic
"EncodedVariableInterpreter: Logtype '{}' contains "
"unexpected variable placeholder 0x{:x}",
logtype_value,
enum_to_underlying_type(var_delim));
enum_to_underlying_type(var_placeholder));
return false;
}
// Move past the variable delimiter
// Move past the variable placeholder
constant_begin_pos = var_position + 1;
}
// Append remainder of logtype, if any
Expand Down
5 changes: 4 additions & 1 deletion components/core/src/Grep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
// Project headers
#include "compressor_frontend/Constants.hpp"
#include "EncodedVariableInterpreter.hpp"
#include "ir/parsing.hpp"
#include "StringReader.hpp"
#include "Utils.hpp"

using ir::is_delim;
using std::string;
using std::vector;
using streaming_archive::reader::Archive;
Expand Down Expand Up @@ -556,7 +558,8 @@ bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_
// - it contains a decimal digit, or
// - it could be a multi-digit hex value, or
// - it's directly preceded by an equals sign and contains an alphabetic character, with no wildcard between the equals sign and the token's first alphabetic character
if (contains_decimal_digit || could_be_multi_digit_hex_value(value, begin_pos, end_pos)) {
auto variable = static_cast<std::string_view>(value).substr(begin_pos, end_pos - begin_pos);
if (contains_decimal_digit || ir::could_be_multi_digit_hex_value(variable)) {
is_var = true;
} else if (begin_pos > 0 && '=' == value[begin_pos - 1] && contains_alphabet) {
// Find first alphabet or wildcard in token
Expand Down
64 changes: 21 additions & 43 deletions components/core/src/LogTypeDictionaryEntry.cpp
Original file line number Diff line number Diff line change
@@ -1,48 +1,22 @@
#include "LogTypeDictionaryEntry.hpp"

// Project headers
#include "ir/parsing.hpp"
#include "type_utils.hpp"
#include "Utils.hpp"

using std::string;

// Constants
static constexpr char cEscapeChar = '\\';

// Local function prototypes
/**
* Escapes any variable delimiters in the identified portion of the given value
* @param value
* @param begin_ix
* @param end_ix
* @param escaped_value
*/
static void escape_variable_delimiters (const std::string& value, size_t begin_ix, size_t end_ix, std::string& escaped_value);

static void escape_variable_delimiters (const string& value, size_t begin_ix, size_t end_ix, string& escaped_value) {
for (size_t i = begin_ix; i < end_ix; ++i) {
auto c = value[i];

// Add escape character if necessary
if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Integer) == c ||
enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Float) == c ||
enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Dictionary) == c ||
cEscapeChar == c) {
escaped_value += cEscapeChar;
}

// Add character
escaped_value += value[i];
}
}

size_t LogTypeDictionaryEntry::get_var_info (size_t var_ix, VarDelim& var_delim) const {
size_t LogTypeDictionaryEntry::get_var_info(
size_t var_ix,
ir::VariablePlaceholder& var_placeholder
) const {
if (var_ix >= m_var_positions.size()) {
return SIZE_MAX;
}

auto var_position = m_var_positions[var_ix];
var_delim = (VarDelim)m_value[var_position];
var_placeholder = static_cast<ir::VariablePlaceholder>(m_value[var_position]);

return m_var_positions[var_ix];
}
Expand Down Expand Up @@ -74,7 +48,7 @@ void LogTypeDictionaryEntry::add_float_var () {

bool LogTypeDictionaryEntry::parse_next_var (const string& msg, size_t& var_begin_pos, size_t& var_end_pos, string& var) {
auto last_var_end_pos = var_end_pos;
if (get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) {
if (ir::get_bounds_of_next_var(msg, var_begin_pos, var_end_pos)) {
// Append to log type: from end of last variable to start of current variable
add_constant(msg, last_var_end_pos, var_begin_pos - last_var_end_pos);

Expand Down Expand Up @@ -133,18 +107,18 @@ ErrorCode LogTypeDictionaryEntry::try_read_from_file (streaming_compression::Dec
if (is_escaped) {
constant += c;
is_escaped = false;
} else if (cEscapeChar == c) {
} else if (ir::cVariablePlaceholderEscapeCharacter == c) {
is_escaped = true;
} else {
if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Integer) == c) {
if (enum_to_underlying_type(ir::VariablePlaceholder::Integer) == c) {
add_constant(constant, 0, constant.length());
constant.clear();
add_int_var();
} else if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Float) == c) {
} else if (enum_to_underlying_type(ir::VariablePlaceholder::Float) == c) {
add_constant(constant, 0, constant.length());
constant.clear();
add_float_var();
} else if (enum_to_underlying_type(LogTypeDictionaryEntry::VarDelim::Dictionary) == c)
} else if (enum_to_underlying_type(ir::VariablePlaceholder::Dictionary) == c)
{
kirkrodrigues marked this conversation as resolved.
Show resolved Hide resolved
add_constant(constant, 0, constant.length());
constant.clear();
Expand All @@ -170,21 +144,25 @@ void LogTypeDictionaryEntry::read_from_file (streaming_compression::Decompressor
}

void LogTypeDictionaryEntry::get_value_with_unfounded_variables_escaped (string& escaped_logtype_value) const {
auto value_view = static_cast<std::string_view>(m_value);
size_t begin_ix = 0;
// Reset escaped value and reserve enough space to at least contain the whole value
escaped_logtype_value.clear();
escaped_logtype_value.reserve(m_value.length());
escaped_logtype_value.reserve(value_view.length());
for (auto var_position : m_var_positions) {
size_t end_ix = var_position;

escape_variable_delimiters(m_value, begin_ix, end_ix, escaped_logtype_value);
ir::escape_and_append_constant_to_logtype(
value_view.substr(begin_ix, end_ix - begin_ix),
escaped_logtype_value
);

// Add variable delimiter
escaped_logtype_value += m_value[end_ix];
// Add variable placeholder
escaped_logtype_value += value_view[end_ix];

// Move begin to start of next portion of logtype between variables
begin_ix = end_ix + 1;
}
// Escape any variable delimiters in remainder of value
escape_variable_delimiters(m_value, begin_ix, m_value.length(), escaped_logtype_value);
// Escape any variable placeholders in remainder of value
ir::escape_and_append_constant_to_logtype(value_view.substr(begin_ix), escaped_logtype_value);
}
35 changes: 14 additions & 21 deletions components/core/src/LogTypeDictionaryEntry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "DictionaryEntry.hpp"
#include "ErrorCode.hpp"
#include "FileReader.hpp"
#include "ir/parsing.hpp"
#include "streaming_compression/zstd/Compressor.hpp"
#include "streaming_compression/zstd/Decompressor.hpp"
#include "TraceableException.hpp"
Expand All @@ -31,15 +32,6 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
}
};

// Constants
/**
 * Single-character markers that are written into a logtype to denote a
 * variable of the corresponding kind.
 */
enum class VarDelim : char {
// NOTE: These values are used within logtypes to denote variables, so care must be taken when changing them
Integer = 0x11,
Dictionary = 0x12,
Float = 0x13,
// NOTE(review): presumably the count of delimiter kinds above rather than a delimiter that appears in logtypes - confirm
Length = 3
};

// Constructors
LogTypeDictionaryEntry () = default;
// Use default copy constructor
Expand All @@ -51,35 +43,35 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {

// Methods
/**
* Adds a dictionary variable delimiter to the given logtype
* Adds a dictionary variable placeholder to the given logtype
* @param logtype
*/
static void add_dict_var (std::string& logtype) {
logtype += enum_to_underlying_type(VarDelim::Dictionary);
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Dictionary);
}
/**
* Adds an integer variable delimiter to the given logtype
* Adds an integer variable placeholder to the given logtype
* @param logtype
*/
static void add_int_var (std::string& logtype) {
logtype += enum_to_underlying_type(VarDelim::Integer);
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Integer);
}
/**
* Adds a float variable delimiter to the given logtype
* Adds a float variable placeholder to the given logtype
* @param logtype
*/
static void add_float_var (std::string& logtype) {
logtype += enum_to_underlying_type(VarDelim::Float);
logtype += enum_to_underlying_type(ir::VariablePlaceholder::Float);
}

size_t get_num_vars () const { return m_var_positions.size(); }
/**
* Gets all info about a variable in the logtype
* @param var_ix The index of the variable to get the info for
* @param var_delim
* @param var_placeholder
* @return The variable's position in the logtype, or SIZE_MAX if var_ix is out of bounds
*/
size_t get_var_info (size_t var_ix, VarDelim& var_delim) const;
size_t get_var_info (size_t var_ix, ir::VariablePlaceholder& var_placeholder) const;

/**
* Gets the size (in-memory) of the data contained in this entry
Expand All @@ -95,15 +87,15 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
*/
void add_constant (const std::string& value_containing_constant, size_t begin_pos, size_t length);
/**
* Adds an int variable delimiter
* Adds an int variable placeholder
*/
void add_int_var ();
/**
* Adds a float variable delimiter
* Adds a float variable placeholder
*/
void add_float_var ();
/**
* Adds a dictionary variable delimiter
* Adds a dictionary variable placeholder
*/
void add_dictionary_var ();

Expand Down Expand Up @@ -147,7 +139,8 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> {
private:
// Methods
/**
* Escapes any variable delimiters that don't correspond to the positions of variables in the logtype entry's value
* Escapes any variable placeholders that don't correspond to the positions
* of variables in the logtype entry's value
* @param escaped_logtype_value
*/
void get_value_with_unfounded_variables_escaped (std::string& escaped_logtype_value) const;
Expand Down
49 changes: 0 additions & 49 deletions components/core/src/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,55 +85,6 @@ ErrorCode create_directory_structure (const string& path, mode_t mode) {
return ErrorCode_Success;
}

/**
 * Searches msg, starting at end_pos, for the next token that should be treated
 * as a variable.
 * NOTE(review): the exact character classes depend on is_delim,
 * is_decimal_digit, is_alphabet and could_be_multi_digit_hex_value, which are
 * defined outside this block - confirm their semantics there.
 * @param msg The message to search
 * @param begin_pos Set to the index of the variable's first character when a variable is found
 * @param end_pos On entry, the index to start searching from; when a variable
 * is found, set to the index one past the variable's last character
 * @return true if a variable was found, false otherwise (in which case
 * begin_pos and end_pos do not describe a variable)
 */
bool get_bounds_of_next_var (const string& msg, size_t& begin_pos, size_t& end_pos) {
const auto msg_length = msg.length();
// Nothing left to search
if (end_pos >= msg_length) {
return false;
}

// Each iteration examines one token; the loop ends when a token qualifies as a variable or the message is exhausted
while (true) {
// Resume from the end of the previously examined token
begin_pos = end_pos;
// Find next non-delimiter
for (; begin_pos < msg_length; ++begin_pos) {
if (false == is_delim(msg[begin_pos])) {
break;
}
}
if (msg_length == begin_pos) {
// Early exit for performance
return false;
}

bool contains_decimal_digit = false;
bool contains_alphabet = false;

// Find next delimiter, recording which kinds of characters the token contains along the way
end_pos = begin_pos;
for (; end_pos < msg_length; ++end_pos) {
char c = msg[end_pos];
if (is_decimal_digit(c)) {
contains_decimal_digit = true;
} else if (is_alphabet(c)) {
contains_alphabet = true;
} else if (is_delim(c)) {
break;
}
}

// Treat token as variable if:
// - it contains a decimal digit, or
// - it's directly preceded by an equals sign and contains an alphabetic character, or
// - it could be a multi-digit hex value
if (contains_decimal_digit || (begin_pos > 0 && '=' == msg[begin_pos - 1] && contains_alphabet) ||
could_be_multi_digit_hex_value(msg, begin_pos, end_pos))
{
break;
}
}

// Reached only via the break above, i.e. after the "msg_length == begin_pos" check failed, so this evaluates to true
return (msg_length != begin_pos);
}

string get_parent_directory_path (const string& path) {
string dirname = get_unambiguous_path(path);

Expand Down
Loading
Loading