From 6571d8d067a5432fb0a6cfc5e85d3e1e3e9fbc1f Mon Sep 17 00:00:00 2001
From: Sharaf Mohamed
Date: Wed, 6 Dec 2023 03:53:42 -0500
Subject: [PATCH] Replace compressor_frontend with log-surgeon submodule.
 (#131)

---
 .gitmodules                                   |   3 +
 components/core/.clang-format                 |   2 +-
 components/core/CMakeLists.txt                |  99 +--
 components/core/README-Schema.md              |   4 +-
 components/core/README.md                     |   2 +-
 components/core/cmake/utils.cmake             |   5 +
 components/core/config/schemas.txt            |   4 +-
 components/core/src/Grep.cpp                  | 131 +++-
 components/core/src/Grep.hpp                  |  33 +-
 components/core/src/LogSurgeonReader.cpp      |  12 +
 components/core/src/LogSurgeonReader.hpp      |  19 +
 components/core/src/Utils.cpp                 | 146 ++++
 components/core/src/Utils.hpp                 |  14 +
 components/core/src/clg/clg.cpp               |  42 +-
 components/core/src/clo/clo.cpp               |   8 +-
 components/core/src/clp/FileCompressor.cpp    |  69 +-
 components/core/src/clp/FileCompressor.hpp    |  27 +-
 components/core/src/clp/compression.cpp       |  14 +-
 components/core/src/clp/compression.hpp       |  19 +-
 components/core/src/clp/run.cpp               |  20 +-
 .../src/compressor_frontend/Constants.hpp     |  42 --
 .../src/compressor_frontend/LALR1Parser.cpp   |  14 -
 .../src/compressor_frontend/LALR1Parser.hpp   | 421 -----------
 .../src/compressor_frontend/LALR1Parser.inc   | 689 ------------------
 .../core/src/compressor_frontend/Lexer.hpp    | 199 -----
 .../core/src/compressor_frontend/Lexer.inc    | 541 --------------
 .../src/compressor_frontend/LogParser.cpp     | 218 ------
 .../src/compressor_frontend/LogParser.hpp     |  70 --
 .../src/compressor_frontend/SchemaParser.cpp  | 463 ------------
 .../src/compressor_frontend/SchemaParser.hpp  | 118 ---
 .../core/src/compressor_frontend/Token.cpp    |  31 -
 .../core/src/compressor_frontend/Token.hpp    |  52 --
 .../finite_automata/RegexAST.hpp              | 449 ------------
 .../finite_automata/RegexAST.inc              | 262 -------
 .../finite_automata/RegexDFA.hpp              |  86 ---
 .../finite_automata/RegexDFA.inc              |  41 --
 .../finite_automata/RegexNFA.hpp              | 140 ----
 .../finite_automata/RegexNFA.inc              | 188 -----
 .../finite_automata/UnicodeIntervalTree.hpp   | 186 -----
 .../finite_automata/UnicodeIntervalTree.inc   | 231 ------
 .../core/src/compressor_frontend/utils.cpp    | 120 ---
 .../core/src/compressor_frontend/utils.hpp    |  21 -
 .../src/streaming_archive/writer/Archive.cpp  |  80 +-
 .../src/streaming_archive/writer/Archive.hpp  |  19 +-
 components/core/submodules/log-surgeon        |   1 +
 components/core/tests/test-Grep.cpp           |  23 +-
 .../core/tests/test-ParserWithUserSchema.cpp  | 154 ++--
 components/core/tests/test_log_files/log.txt  |   5 +-
 .../colon_missing_schema.txt                  |   2 +-
 .../tests/test_schema_files/real_schema.txt   |   2 +-
 .../schema_with_delimiter_in_regex_error.txt  |   2 +-
 .../schema_with_delimiters.txt                |   2 +-
 ...schema_with_multicharacter_token_error.txt |   2 +-
 .../schema_without_delimiters.txt             |   2 +-
 .../tests/test_schema_files/search_schema.txt |   2 +-
 .../clp-env-base-centos7.4/Dockerfile         |   4 +-
 .../clp-env-base-ubuntu-focal/Dockerfile      |   6 +
 .../scripts/lib_install/centos7.4/README.md   |   4 +-
 .../centos7.4/install-packages-from-source.sh |   4 +-
 .../centos7.4/install-prebuilt-packages.sh    |   2 +-
 .../ubuntu-focal/install-prebuilt-packages.sh |   2 +
 .../src/etc/clp-schema.template.txt           |   2 +-
 62 files changed, 658 insertions(+), 4917 deletions(-)
 create mode 100644 components/core/src/LogSurgeonReader.cpp
 create mode 100644 components/core/src/LogSurgeonReader.hpp
 delete mode 100644 components/core/src/compressor_frontend/Constants.hpp
 delete mode 100644 components/core/src/compressor_frontend/LALR1Parser.cpp
 delete mode 100644 components/core/src/compressor_frontend/LALR1Parser.hpp
 delete mode 100644 components/core/src/compressor_frontend/LALR1Parser.inc
 delete mode 100644 components/core/src/compressor_frontend/Lexer.hpp
 delete mode 100644 components/core/src/compressor_frontend/Lexer.inc
 delete mode 100644 components/core/src/compressor_frontend/LogParser.cpp
 delete mode 100644 components/core/src/compressor_frontend/LogParser.hpp
 delete mode 100644 components/core/src/compressor_frontend/SchemaParser.cpp
 delete mode 100644 components/core/src/compressor_frontend/SchemaParser.hpp
 delete mode 100644 components/core/src/compressor_frontend/Token.cpp
 delete mode 100644 components/core/src/compressor_frontend/Token.hpp
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexAST.hpp
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexAST.inc
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexDFA.hpp
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexDFA.inc
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexNFA.hpp
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/RegexNFA.inc
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
 delete mode 100644 components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
 delete mode 100644 components/core/src/compressor_frontend/utils.cpp
 delete mode 100644 components/core/src/compressor_frontend/utils.hpp
 create mode 160000 components/core/submodules/log-surgeon

diff --git a/.gitmodules b/.gitmodules
index ba28584e2..4b3b13551 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,6 +11,9 @@
 [submodule "components/core/submodules/yaml-cpp"]
     path = components/core/submodules/yaml-cpp
     url = https://github.com/jbeder/yaml-cpp.git
+[submodule "components/core/submodules/log-surgeon"]
+    path = components/core/submodules/log-surgeon
+    url = https://github.com/y-scope/log-surgeon.git
 [submodule "components/core/submodules/boost-outcome"]
     path = components/core/submodules/boost-outcome
     url = https://github.com/boostorg/outcome.git
diff --git a/components/core/.clang-format b/components/core/.clang-format
index 42f194fdb..ce26532e7 100644
--- a/components/core/.clang-format
+++ b/components/core/.clang-format
@@ -68,7 +68,7 @@ IncludeBlocks: Regroup
 IncludeCategories:
   # NOTE: A header is grouped by first matching regex
   # Third-party headers. Update when adding new third-party libraries.
-  - Regex: '^<(archive|boost|catch2|date|fmt|json|mariadb|spdlog|sqlite3|yaml-cpp|zstd)'
+  - Regex: '^<(archive|boost|catch2|date|fmt|json|log_surgeon|mariadb|spdlog|sqlite3|yaml-cpp|zstd)'
     Priority: 3
   # C system headers
   - Regex: '^<.+.h>'
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 503d8f122..f44cc9ad6 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -128,6 +128,9 @@ else()
     message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for LibArchive")
 endif()
 
+# Add log surgeon
+add_subdirectory(submodules/log-surgeon EXCLUDE_FROM_ALL)
+
 # Find and setup MariaDBClient library
 if(CLP_USE_STATIC_LIBS)
     # NOTE: We can't statically link to MariaDBClient since it's GPL
@@ -193,28 +196,6 @@ set(SOURCE_FILES_clp
         src/clp/StructuredFileToCompress.hpp
         src/clp/utils.cpp
         src/clp/utils.hpp
-        src/compressor_frontend/Constants.hpp
-        src/compressor_frontend/finite_automata/RegexAST.hpp
-        src/compressor_frontend/finite_automata/RegexAST.inc
-        src/compressor_frontend/finite_automata/RegexDFA.hpp
-        src/compressor_frontend/finite_automata/RegexDFA.inc
-        src/compressor_frontend/finite_automata/RegexNFA.hpp
-        src/compressor_frontend/finite_automata/RegexNFA.inc
-        src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
-        src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
-        src/compressor_frontend/LALR1Parser.cpp
-        src/compressor_frontend/LALR1Parser.hpp
-        src/compressor_frontend/LALR1Parser.inc
-        src/compressor_frontend/Lexer.hpp
-        src/compressor_frontend/Lexer.inc
-        src/compressor_frontend/LogParser.cpp
-        src/compressor_frontend/LogParser.hpp
-        src/compressor_frontend/SchemaParser.cpp
-        src/compressor_frontend/SchemaParser.hpp
-        src/compressor_frontend/Token.cpp
-        src/compressor_frontend/Token.hpp
-        src/compressor_frontend/utils.cpp
-        src/compressor_frontend/utils.hpp
         src/database_utils.cpp
         src/database_utils.hpp
         src/Defs.h
@@ -262,6 +243,8 @@ set(SOURCE_FILES_clp
         src/LibarchiveFileReader.hpp
         src/LibarchiveReader.cpp
         src/LibarchiveReader.hpp
+        src/LogSurgeonReader.cpp
+        src/LogSurgeonReader.hpp
         src/LogTypeDictionaryEntry.cpp
         src/LogTypeDictionaryEntry.hpp
         src/LogTypeDictionaryReader.cpp
@@ -360,6 +343,7 @@ target_link_libraries(clp
        PRIVATE
        Boost::filesystem Boost::iostreams Boost::program_options
        fmt::fmt
+       log_surgeon::log_surgeon
        spdlog::spdlog
        ${sqlite_LIBRARY_DEPENDENCIES}
        LibArchive::LibArchive
@@ -384,26 +368,6 @@ set(SOURCE_FILES_clg
        src/clg/clg.cpp
        src/clg/CommandLineArguments.cpp
        src/clg/CommandLineArguments.hpp
-       src/compressor_frontend/Constants.hpp
-       src/compressor_frontend/finite_automata/RegexAST.hpp
-       src/compressor_frontend/finite_automata/RegexAST.inc
-       src/compressor_frontend/finite_automata/RegexDFA.hpp
-       src/compressor_frontend/finite_automata/RegexDFA.inc
-       src/compressor_frontend/finite_automata/RegexNFA.hpp
-       src/compressor_frontend/finite_automata/RegexNFA.inc
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
-       src/compressor_frontend/LALR1Parser.cpp
-       src/compressor_frontend/LALR1Parser.hpp
-       src/compressor_frontend/LALR1Parser.inc
-       src/compressor_frontend/Lexer.hpp
-       src/compressor_frontend/Lexer.inc
-       src/compressor_frontend/SchemaParser.cpp
-       src/compressor_frontend/SchemaParser.hpp
-       src/compressor_frontend/Token.cpp
-       src/compressor_frontend/Token.hpp
-       src/compressor_frontend/utils.cpp
-       src/compressor_frontend/utils.hpp
        src/database_utils.cpp
        src/database_utils.hpp
        src/Defs.h
@@ -439,6 +403,8 @@ set(SOURCE_FILES_clg
        src/ir/LogEvent.hpp
        src/ir/parsing.cpp
        src/ir/parsing.hpp
+       src/LogSurgeonReader.cpp
+       src/LogSurgeonReader.hpp
        src/ir/parsing.inc
        src/LogTypeDictionaryEntry.cpp
        src/LogTypeDictionaryEntry.hpp
@@ -532,6 +498,7 @@ target_link_libraries(clg
        Boost::filesystem Boost::iostreams Boost::program_options
        fmt::fmt
        KQL
+       log_surgeon::log_surgeon
        MariaDBClient::MariaDBClient
        spdlog::spdlog
        ${sqlite_LIBRARY_DEPENDENCIES}
@@ -551,26 +518,6 @@ set(SOURCE_FILES_clo
        src/clo/CommandLineArguments.hpp
        src/clo/ControllerMonitoringThread.cpp
        src/clo/ControllerMonitoringThread.hpp
-       src/compressor_frontend/Constants.hpp
-       src/compressor_frontend/finite_automata/RegexAST.hpp
-       src/compressor_frontend/finite_automata/RegexAST.inc
-       src/compressor_frontend/finite_automata/RegexDFA.hpp
-       src/compressor_frontend/finite_automata/RegexDFA.inc
-       src/compressor_frontend/finite_automata/RegexNFA.hpp
-       src/compressor_frontend/finite_automata/RegexNFA.inc
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
-       src/compressor_frontend/LALR1Parser.cpp
-       src/compressor_frontend/LALR1Parser.hpp
-       src/compressor_frontend/LALR1Parser.inc
-       src/compressor_frontend/Lexer.hpp
-       src/compressor_frontend/Lexer.inc
-       src/compressor_frontend/SchemaParser.cpp
-       src/compressor_frontend/SchemaParser.hpp
-       src/compressor_frontend/Token.cpp
-       src/compressor_frontend/Token.hpp
-       src/compressor_frontend/utils.cpp
-       src/compressor_frontend/utils.hpp
        src/database_utils.cpp
        src/database_utils.hpp
        src/Defs.h
@@ -598,6 +545,8 @@ set(SOURCE_FILES_clo
        src/ir/LogEvent.hpp
        src/ir/parsing.cpp
        src/ir/parsing.hpp
+       src/LogSurgeonReader.cpp
+       src/LogSurgeonReader.hpp
        src/ir/parsing.inc
        src/LogTypeDictionaryEntry.cpp
        src/LogTypeDictionaryEntry.hpp
@@ -690,6 +639,7 @@ target_link_libraries(clo
        PRIVATE
        Boost::filesystem Boost::iostreams Boost::program_options
        fmt::fmt
+       log_surgeon::log_surgeon
        msgpack-cxx
        spdlog::spdlog
        ${sqlite_LIBRARY_DEPENDENCIES}
@@ -723,28 +673,6 @@ set(SOURCE_FILES_unitTest
        src/clp/StructuredFileToCompress.hpp
        src/clp/utils.cpp
        src/clp/utils.hpp
-       src/compressor_frontend/Constants.hpp
-       src/compressor_frontend/finite_automata/RegexAST.hpp
-       src/compressor_frontend/finite_automata/RegexAST.inc
-       src/compressor_frontend/finite_automata/RegexDFA.hpp
-       src/compressor_frontend/finite_automata/RegexDFA.inc
-       src/compressor_frontend/finite_automata/RegexNFA.hpp
-       src/compressor_frontend/finite_automata/RegexNFA.inc
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
-       src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
-       src/compressor_frontend/LALR1Parser.cpp
-       src/compressor_frontend/LALR1Parser.hpp
-       src/compressor_frontend/LALR1Parser.inc
-       src/compressor_frontend/Lexer.hpp
-       src/compressor_frontend/Lexer.inc
-       src/compressor_frontend/LogParser.cpp
-       src/compressor_frontend/LogParser.hpp
-       src/compressor_frontend/SchemaParser.cpp
-       src/compressor_frontend/SchemaParser.hpp
-       src/compressor_frontend/Token.cpp
-       src/compressor_frontend/Token.hpp
-       src/compressor_frontend/utils.cpp
-       src/compressor_frontend/utils.hpp
        src/database_utils.cpp
        src/database_utils.hpp
        src/Defs.h
@@ -809,6 +737,8 @@ set(SOURCE_FILES_unitTest
        src/LibarchiveFileReader.hpp
        src/LibarchiveReader.cpp
        src/LibarchiveReader.hpp
+       src/LogSurgeonReader.cpp
+       src/LogSurgeonReader.hpp
        src/LogTypeDictionaryEntry.cpp
        src/LogTypeDictionaryEntry.hpp
        src/LogTypeDictionaryReader.cpp
@@ -925,6 +855,7 @@ target_link_libraries(unitTest
        PRIVATE
        Boost::filesystem Boost::iostreams Boost::program_options
        fmt::fmt
+       log_surgeon::log_surgeon
        LibArchive::LibArchive
        MariaDBClient::MariaDBClient
        spdlog::spdlog
diff --git a/components/core/README-Schema.md b/components/core/README-Schema.md
index ac59ca2ab..6644abd66 100644
--- a/components/core/README-Schema.md
+++ b/components/core/README-Schema.md
@@ -17,7 +17,7 @@ delimiters: \t\r\n:,!;%
 timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
 timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Custom variables
 hex:[a-fA-F]+
@@ -49,7 +49,7 @@ equals:.*=.*[a-zA-Z0-9].*
   start of the file then a newline is used to indicate the beginning of a new
   log message. Timestamp patterns are not matched midline and are not stored as
   dictionary variables as they may contain delimiters.
-* `int` and `double` are keywords. These are encoded specially for compression
+* `int` and `float` are keywords. These are encoded specially for compression
   performance.
 
 ## Supported Regex
diff --git a/components/core/README.md b/components/core/README.md
index 5e0221d1b..6820d311c 100644
--- a/components/core/README.md
+++ b/components/core/README.md
@@ -22,7 +22,7 @@ CLP core is the low-level component that performs compression, decompression, an
 * We have built and tested CLP on the OSes listed
   [below](https://github.com/y-scope/clp/tree/main/components/core#native-environment).
 * If you have trouble building for another OS, file an issue, and we may be able to help.
-* A compiler that supports C++17 (e.g., gcc-8)
+* A compiler that supports C++17 and std::span (e.g., gcc-10)
 
 ## Building
 
diff --git a/components/core/cmake/utils.cmake b/components/core/cmake/utils.cmake
index f2bb940ce..d6aefa160 100644
--- a/components/core/cmake/utils.cmake
+++ b/components/core/cmake/utils.cmake
@@ -41,9 +41,14 @@ set(SOURCE_FILES_make-dictionaries-readable
         ${CMAKE_CURRENT_SOURCE_DIR}/submodules/date/include/date/date.h
         )
 add_executable(make-dictionaries-readable ${SOURCE_FILES_make-dictionaries-readable})
+target_include_directories(make-dictionaries-readable
+        PRIVATE
+        ${CMAKE_SOURCE_DIR}/submodules
+        )
 target_link_libraries(make-dictionaries-readable
         PRIVATE
         Boost::filesystem Boost::iostreams Boost::program_options
+        log_surgeon::log_surgeon
         spdlog::spdlog
         ZStd::ZStd
         )
diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt
index 2965a3d8f..e0b777859 100644
--- a/components/core/config/schemas.txt
+++ b/components/core/config/schemas.txt
@@ -9,9 +9,9 @@ timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
 // E.g. [20150131-15:50:45]
 timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
 
-// Specially-encoded variables (using the `int` and `double` keywords)
+// Specially-encoded variables (using the `int` and `float` keywords)
 int:\-{0,1}[0-9]+
-double:\-{0,1}[0-9]+\.[0-9]+
+float:\-{0,1}[0-9]+\.[0-9]+
 
 // Dictionary variables
 hex:[a-fA-F]+
diff --git a/components/core/src/Grep.cpp b/components/core/src/Grep.cpp
index e533a4eea..fbcc151e6 100644
--- a/components/core/src/Grep.cpp
+++ b/components/core/src/Grep.cpp
@@ -3,10 +3,13 @@
 // C++ libraries
 #include
 
+// Log surgeon
+#include <log_surgeon/Constants.hpp>
+
 // Project headers
-#include "compressor_frontend/Constants.hpp"
 #include "EncodedVariableInterpreter.hpp"
 #include "ir/parsing.hpp"
+#include "LogSurgeonReader.hpp"
 #include "StringReader.hpp"
 #include "Utils.hpp"
 
@@ -233,6 +236,16 @@ bool QueryToken::change_to_next_possible_type () {
     }
 }
 
+/**
+ * Wraps the tokens returned from the log_surgeon lexer, and stores the variable
+ * ids of the tokens in a search query in a set. This allows for optimized
+ * search performance.
+ */
+class SearchToken : public log_surgeon::Token {
+public:
+    std::set<int> m_type_ids_set;
+};
+
 // Local prototypes
 /**
  * Process a QueryToken that is definitely a variable
@@ -419,10 +432,17 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery (const Archiv
     return SubQueryMatchabilityResult::MayMatch;
 }
 
-bool Grep::process_raw_query (const Archive& archive, const string& search_string, epochtime_t search_begin_ts, epochtime_t search_end_ts, bool ignore_case,
-                              Query& query, compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer,
-                              bool use_heuristic)
-{
+bool Grep::process_raw_query(
+        Archive const& archive,
+        string const& search_string,
+        epochtime_t search_begin_ts,
+        epochtime_t search_end_ts,
+        bool ignore_case,
+        Query& query,
+        log_surgeon::lexers::ByteLexer& forward_lexer,
+        log_surgeon::lexers::ByteLexer& reverse_lexer,
+        bool use_heuristic
+) {
     // Set properties which require no processing
     query.set_search_begin_timestamp(search_begin_ts);
     query.set_search_end_timestamp(search_end_ts);
@@ -437,22 +457,24 @@ bool Grep::process_raw_query (const Archive& archive, const string& search_strin
     processed_search_string = clean_up_wildcard_search_string(processed_search_string);
     query.set_search_string(processed_search_string);
 
-    // Replace non-greedy wildcards with greedy wildcards since we currently have no support for searching compressed files with non-greedy wildcards
-    std::replace(processed_search_string.begin(), processed_search_string.end(), '?', '*');
-    // Clean-up in case any instances of "?*" or "*?" were changed into "**"
-    processed_search_string = clean_up_wildcard_search_string(processed_search_string);
-
     // Split search_string into tokens with wildcards
     vector<QueryToken> query_tokens;
     size_t begin_pos = 0;
     size_t end_pos = 0;
     bool is_var;
     if (use_heuristic) {
+        // Replace non-greedy wildcards with greedy wildcards since we currently
+        // have no support for searching compressed files with non-greedy
+        // wildcards
+        std::replace(processed_search_string.begin(), processed_search_string.end(), '?', '*');
+        // Clean-up in case any instances of "?*" or "*?" were changed into "**"
+        processed_search_string = clean_up_wildcard_search_string(processed_search_string);
         while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var)) {
             query_tokens.emplace_back(processed_search_string, begin_pos, end_pos, is_var);
         }
     } else {
-        while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var, forward_lexer, reverse_lexer)) {
+        while (get_bounds_of_next_potential_var(processed_search_string, begin_pos, end_pos, is_var,
+                                                forward_lexer, reverse_lexer)) {
             query_tokens.emplace_back(processed_search_string, begin_pos, end_pos, is_var);
         }
     }
@@ -621,9 +643,14 @@ bool Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_
     return (value_length != begin_pos);
 }
 
-bool
-Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos, size_t& end_pos, bool& is_var,
-                                        compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer) {
+bool Grep::get_bounds_of_next_potential_var(
+        string const& value,
+        size_t& begin_pos,
+        size_t& end_pos,
+        bool& is_var,
+        log_surgeon::lexers::ByteLexer& forward_lexer,
+        log_surgeon::lexers::ByteLexer& reverse_lexer
+) {
     const size_t value_length = value.length();
     if (end_pos >= value_length) {
         return false;
@@ -699,35 +726,59 @@ Grep::get_bounds_of_next_potential_var (const string& value, size_t& begin_pos,
                 break;
             }
         }
+        SearchToken search_token;
         if (has_wildcard_in_middle || (has_prefix_wildcard && has_suffix_wildcard)) {
             // DO NOTHING
-        } else if (has_suffix_wildcard) { //asdsas*
-            StringReader stringReader;
-            stringReader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
-            forward_lexer.reset(stringReader);
-            compressor_frontend::Token token = forward_lexer.scan_with_wildcard(value[end_pos - 1]);
-            if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-                token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenEndID) {
-                is_var = true;
-            }
-        } else if (has_prefix_wildcard) { // *asdas
-            std::string value_reverse = value.substr(begin_pos + 1, end_pos - begin_pos - 1);
-            std::reverse(value_reverse.begin(), value_reverse.end());
-            StringReader stringReader;
-            stringReader.open(value_reverse);
-            reverse_lexer.reset(stringReader);
-            compressor_frontend::Token token = reverse_lexer.scan_with_wildcard(value[begin_pos]);
-            if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-                token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) {
-                is_var = true;
+        } else {
+            StringReader string_reader;
+            LogSurgeonReader reader_wrapper(string_reader);
+            log_surgeon::ParserInputBuffer parser_input_buffer;
+            if (has_suffix_wildcard) {  // text*
+                // TODO: Opening a StringReader on a substring just to read it
+                // into the ParserInputBuffer is a convoluted way to copy a
+                // string; this should be improved when a SearchParser is added
+                // to log_surgeon
+                string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
+                parser_input_buffer.read_if_safe(reader_wrapper);
+                forward_lexer.reset();
+                forward_lexer.scan_with_wildcard(
+                        parser_input_buffer,
+                        value[end_pos - 1],
+                        search_token
+                );
+            } else if (has_prefix_wildcard) {  // *text
+                std::string value_reverse
+                        = value.substr(begin_pos + 1, end_pos - begin_pos - 1);
+                std::reverse(value_reverse.begin(), value_reverse.end());
+                string_reader.open(value_reverse);
+                parser_input_buffer.read_if_safe(reader_wrapper);
+                reverse_lexer.reset();
+                reverse_lexer.scan_with_wildcard(
+                        parser_input_buffer,
+                        value[begin_pos],
+                        search_token
+                );
+            } else {  // no wildcards
+                string_reader.open(value.substr(begin_pos, end_pos - begin_pos));
+                parser_input_buffer.read_if_safe(reader_wrapper);
+                forward_lexer.reset();
+                forward_lexer.scan(parser_input_buffer, search_token);
+                search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0));
             }
-        } else { // no wildcards
-            StringReader stringReader;
-            stringReader.open(value.substr(begin_pos, end_pos - begin_pos));
-            forward_lexer.reset(stringReader);
-            compressor_frontend::Token token = forward_lexer.scan();
-            if (token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenUncaughtStringID &&
-                token.m_type_ids->at(0) != (int) compressor_frontend::SymbolID::TokenEndID) {
+            // TODO: use a set so it's faster
+            // auto const& set = search_token.m_type_ids_set;
+            // if (set.find(static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID))
+            //     == set.end()
+            //     && set.find(static_cast<int>(log_surgeon::SymbolID::TokenEndID))
+            //     == set.end())
+            // {
+            //     is_var = true;
+            // }
+            auto const& type = search_token.m_type_ids_ptr->at(0);
+            if (type != static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID)
+                && type != static_cast<int>(log_surgeon::SymbolID::TokenEndID))
+            {
                 is_var = true;
             }
         }
diff --git a/components/core/src/Grep.hpp b/components/core/src/Grep.hpp
index 68225eb1b..ece1e62d9 100644
--- a/components/core/src/Grep.hpp
+++ b/components/core/src/Grep.hpp
@@ -4,12 +4,14 @@
 // C++ libraries
 #include
 
+// Log surgeon
+#include <log_surgeon/Lexer.hpp>
+
 // Project headers
 #include "Defs.h"
 #include "Query.hpp"
 #include "streaming_archive/reader/Archive.hpp"
 #include "streaming_archive/reader/File.hpp"
-#include "compressor_frontend/Lexer.hpp"
 
 class Grep {
 
@@ -34,11 +36,23 @@ class Grep {
      * @param search_end_ts
      * @param ignore_case
      * @param query
+     * @param forward_lexer DFA for determining if input is in the schema
+     * @param reverse_lexer DFA for determining if reverse of input is in the
+     * schema
+     * @param use_heuristic
      * @return true if query may match messages, false otherwise
      */
-    static bool process_raw_query (const streaming_archive::reader::Archive& archive, const std::string& search_string, epochtime_t search_begin_ts,
-                                   epochtime_t search_end_ts, bool ignore_case, Query& query, compressor_frontend::lexers::ByteLexer& forward_lexer,
-                                   compressor_frontend::lexers::ByteLexer& reverse_lexer, bool use_heuristic);
+    static bool process_raw_query(
+            streaming_archive::reader::Archive const& archive,
+            std::string const& search_string,
+            epochtime_t search_begin_ts,
+            epochtime_t search_end_ts,
+            bool ignore_case,
+            Query& query,
+            log_surgeon::lexers::ByteLexer& forward_lexer,
+            log_surgeon::lexers::ByteLexer& reverse_lexer,
+            bool use_heuristic
+    );
 
     /**
     * Returns bounds of next potential variable (either a definite variable or a token with wildcards)
@@ -60,9 +74,14 @@ class Grep {
      * @param reverse_lexer DFA for determining if reverse of input is in the schema
      * @return true if another potential variable was found, false otherwise
      */
-    static bool get_bounds_of_next_potential_var (const std::string& value, size_t& begin_pos, size_t& end_pos, bool& is_var,
-                                                  compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer);
-
+    static bool get_bounds_of_next_potential_var(
+            std::string const& value,
+            size_t& begin_pos,
+            size_t& end_pos,
+            bool& is_var,
+            log_surgeon::lexers::ByteLexer& forward_lexer,
+            log_surgeon::lexers::ByteLexer& reverse_lexer
+    );
     /**
      * Marks which sub-queries in each query are relevant to the given file
      * @param compressed_file
diff --git a/components/core/src/LogSurgeonReader.cpp b/components/core/src/LogSurgeonReader.cpp
new file mode 100644
index 000000000..e3d0e7a12
--- /dev/null
+++ b/components/core/src/LogSurgeonReader.cpp
@@ -0,0 +1,12 @@
+#include "LogSurgeonReader.hpp"
+
+LogSurgeonReader::LogSurgeonReader(ReaderInterface& reader_interface)
+        : m_reader_interface(reader_interface) {
+    read = [this](char* buf, size_t count, size_t& read_to) -> log_surgeon::ErrorCode {
+        m_reader_interface.read(buf, count, read_to);
+        if (read_to == 0) {
+            return log_surgeon::ErrorCode::EndOfFile;
+        }
+        return log_surgeon::ErrorCode::Success;
+    };
+}
diff --git a/components/core/src/LogSurgeonReader.hpp b/components/core/src/LogSurgeonReader.hpp
new file mode 100644
index 000000000..82e762bf9
--- /dev/null
+++ b/components/core/src/LogSurgeonReader.hpp
@@ -0,0 +1,19 @@
+#ifndef LOG_SURGEON_READER_HPP
+#define LOG_SURGEON_READER_HPP
+
+#include <log_surgeon/Reader.hpp>
+
+#include "ReaderInterface.hpp"
+
+/*
+ * Wrapper providing a read function that works with the parsers in
+ * log_surgeon.
+ */
+class LogSurgeonReader : public log_surgeon::Reader {
+public:
+    LogSurgeonReader(ReaderInterface& reader_interface);
+
+private:
+    ReaderInterface& m_reader_interface;
+};
+
+#endif // LOG_SURGEON_READER_HPP
diff --git a/components/core/src/Utils.cpp b/components/core/src/Utils.cpp
index a0b226fee..f3dd17276 100644
--- a/components/core/src/Utils.cpp
+++ b/components/core/src/Utils.cpp
@@ -14,6 +14,12 @@
 #include
 #include
 
+// spdlog
+#include <spdlog/spdlog.h>
+
+// Log surgeon
+#include <log_surgeon/SchemaParser.hpp>
+
 // Project headers
 #include "spdlog_with_specializations.hpp"
 #include "string_utils.hpp"
@@ -164,3 +170,143 @@ ErrorCode read_list_of_paths (const string& list_path, vector<string>& paths) {
 
     return ErrorCode_Success;
 }
+
+// TODO: duplicates code in log_surgeon/parser.tpp, should implement a
+// SearchParser in log_surgeon instead and use it here. Specifically,
+// initialization of lexer.m_symbol_id, contains_delimiter error, and add_rule
+// logic.
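+// Illustrative usage sketch (hypothetical comment, not verbatim from any call
+// site; the concrete call sites are in clg.cpp and clo.cpp below): callers
+// build a forward/reverse lexer pair from the same schema file, where
+// `schema_file_path` is the archive's schema file:
+//
+//     log_surgeon::lexers::ByteLexer forward_lexer;
+//     load_lexer_from_file(schema_file_path, false, forward_lexer);
+//     log_surgeon::lexers::ByteLexer reverse_lexer;
+//     load_lexer_from_file(schema_file_path, true, reverse_lexer);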
+void load_lexer_from_file(
+        std::string const& schema_file_path,
+        bool reverse,
+        log_surgeon::lexers::ByteLexer& lexer
+) {
+    log_surgeon::SchemaParser sp;
+    std::unique_ptr<log_surgeon::SchemaAST> schema_ast
+            = log_surgeon::SchemaParser::try_schema_file(schema_file_path);
+    if (!lexer.m_symbol_id.empty()) {
+        throw std::runtime_error("Error: symbol_ids initialized before setting enum symbol_ids");
+    }
+
+    // cTokenEnd and cTokenUncaughtString never need to be added as a rule to
+    // the lexer as they are not parsed
+    lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast<int>(log_surgeon::SymbolID::TokenEndID);
+    lexer.m_symbol_id[log_surgeon::cTokenUncaughtString]
+            = static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID);
+    // cTokenInt, cTokenFloat, cTokenFirstTimestamp, and cTokenNewlineTimestamp
+    // each have unknown rule(s) until specified by the user so can't be
+    // explicitly added and are done by looping over schema_vars (user schema)
+    lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast<int>(log_surgeon::SymbolID::TokenIntId);
+    lexer.m_symbol_id[log_surgeon::cTokenFloat]
+            = static_cast<int>(log_surgeon::SymbolID::TokenFloatId);
+    lexer.m_symbol_id[log_surgeon::cTokenFirstTimestamp]
+            = static_cast<int>(log_surgeon::SymbolID::TokenFirstTimestampId);
+    lexer.m_symbol_id[log_surgeon::cTokenNewlineTimestamp]
+            = static_cast<int>(log_surgeon::SymbolID::TokenNewlineTimestampId);
+    // cTokenNewline is not added in schema_vars and can be explicitly added
+    // as '\n' to catch the end of non-timestamped log messages
+    lexer.m_symbol_id[log_surgeon::cTokenNewline]
+            = static_cast<int>(log_surgeon::SymbolID::TokenNewlineId);
+
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenEndID)] = log_surgeon::cTokenEnd;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID)]
+            = log_surgeon::cTokenUncaughtString;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenIntId)] = log_surgeon::cTokenInt;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenFloatId)]
+            = log_surgeon::cTokenFloat;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenFirstTimestampId)]
+            = log_surgeon::cTokenFirstTimestamp;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenNewlineTimestampId)]
+            = log_surgeon::cTokenNewlineTimestamp;
+    lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenNewlineId)]
+            = log_surgeon::cTokenNewline;
+
+    lexer.add_rule(
+            lexer.m_symbol_id["newLine"],
+            std::move(std::make_unique<log_surgeon::finite_automata::RegexASTLiteral<
+                    log_surgeon::finite_automata::RegexNFAByteState>>(
+                    log_surgeon::finite_automata::RegexASTLiteral<
+                            log_surgeon::finite_automata::RegexNFAByteState>('\n')
+            ))
+    );
+
+    for (auto const& delimiters_ast : schema_ast->m_delimiters) {
+        auto* delimiters_ptr = dynamic_cast<log_surgeon::DelimiterStringAST*>(delimiters_ast.get());
+        if (delimiters_ptr != nullptr) {
+            lexer.add_delimiters(delimiters_ptr->m_delimiters);
+        }
+    }
+    vector<uint32_t> delimiters;
+    for (uint32_t i = 0; i < log_surgeon::cSizeOfByte; i++) {
+        if (lexer.is_delimiter(i)) {
+            delimiters.push_back(i);
+        }
+    }
+    for (std::unique_ptr<log_surgeon::ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
+        auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());
+
+        if ("timestamp" == rule->m_name) {
+            continue;
+        }
+
+        if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) {
+            lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size();
+            lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name;
+        }
+
+        // transform '.' from any-character into any non-delimiter character
+        rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);
+
+        bool is_possible_input[log_surgeon::cUnicodeMax] = {false};
+        rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
+        bool contains_delimiter = false;
+        uint32_t delimiter_name;
+        for (uint32_t delimiter : delimiters) {
+            if (is_possible_input[delimiter]) {
+                contains_delimiter = true;
+                delimiter_name = delimiter;
+                break;
+            }
+        }
+
+        if (contains_delimiter) {
+            FileReader schema_reader;
+            ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path);
+            if (ErrorCode_Success != error_code) {
+                throw std::runtime_error(
+                        schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                        + ": error: '" + rule->m_name
+                        + "' has regex pattern which contains delimiter '"
+                        + char(delimiter_name) + "'.\n"
+                );
+            } else {
+                // more detailed debugging based on looking at the file
+                string line;
+                for (uint32_t i = 0; i <= rule->m_line_num; i++) {
+                    schema_reader.read_to_delimiter('\n', false, false, line);
+                }
+                int colon_pos = 0;
+                for (char i : line) {
+                    colon_pos++;
+                    if (i == ':') {
+                        break;
+                    }
+                }
+                string indent(10, ' ');
+                string spaces(colon_pos, ' ');
+                string arrows(line.size() - colon_pos, '^');
+
+                throw std::runtime_error(
+                        schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                        + ": error: '" + rule->m_name
+                        + "' has regex pattern which contains delimiter '"
+                        + char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + spaces
+                        + arrows + "\n"
+                );
+            }
+        }
+        lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
+    }
+    if (reverse) {
+        lexer.generate_reverse();
+    } else {
+        lexer.generate();
+    }
+}
diff --git a/components/core/src/Utils.hpp b/components/core/src/Utils.hpp
index e3fa814a0..ea09f0ca7 100644
--- a/components/core/src/Utils.hpp
+++ b/components/core/src/Utils.hpp
@@ -8,6 +8,9 @@
 #include
 #include
 
+// Log surgeon
+#include <log_surgeon/Lexer.hpp>
+
 // Project headers
 #include "Defs.h"
 #include "ErrorCode.hpp"
@@ -65,4 +68,15 @@ std::string get_unambiguous_path (const std::string& path);
 */
 ErrorCode read_list_of_paths (const std::string& list_path, std::vector<std::string>& paths);
 
+/**
+ * Loads a lexer from a file
+ * @param schema_file_path
+ * @param done
+ * @param forward_lexer_ptr
+ */
+void load_lexer_from_file(
+        std::string const& schema_file_path,
+        bool done,
+        log_surgeon::lexers::ByteLexer& forward_lexer_ptr
+);
 #endif // UTILS_HPP
diff --git a/components/core/src/clg/clg.cpp b/components/core/src/clg/clg.cpp
index 16ab7b4df..7d8160ca0 100644
--- a/components/core/src/clg/clg.cpp
+++ b/components/core/src/clg/clg.cpp
@@ -8,19 +8,21 @@
 // spdlog
 #include
 
+// Log surgeon
+#include <log_surgeon/Lexer.hpp>
+
 // Project headers
 #include "../Defs.h"
-#include "../compressor_frontend/utils.hpp"
 #include "../Grep.hpp"
 #include "../GlobalMySQLMetadataDB.hpp"
 #include "../GlobalSQLiteMetadataDB.hpp"
 #include "../Profiler.hpp"
 #include "../spdlog_with_specializations.hpp"
 #include "../streaming_archive/Constants.hpp"
+#include "../Utils.hpp"
 #include "CommandLineArguments.hpp"
 
 using clg::CommandLineArguments;
-using compressor_frontend::load_lexer_from_file;
 using std::cout;
 using std::cerr;
 using std::endl;
@@ -135,8 +137,14 @@ static bool open_archive (const string& archive_path, Archive& archive_reader) {
     return true;
 }
 
-static bool search (const vector<string>& search_strings, CommandLineArguments& command_line_args, Archive& archive,
-                    compressor_frontend::lexers::ByteLexer& forward_lexer, compressor_frontend::lexers::ByteLexer& reverse_lexer, bool use_heuristic) {
+static bool search(
+        vector<string> const& search_strings,
+        CommandLineArguments& command_line_args,
+        Archive& archive,
+        log_surgeon::lexers::ByteLexer& forward_lexer,
+        log_surgeon::lexers::ByteLexer& reverse_lexer,
+        bool use_heuristic
+) {
     ErrorCode error_code;
     auto search_begin_ts = command_line_args.get_search_begin_ts();
     auto search_end_ts = command_line_args.get_search_end_ts();
@@ -148,9 +156,8 @@ static bool search (const vector<string>& search_strings, CommandLineArguments&
     bool is_superseding_query = false;
     for (const auto& search_string : search_strings) {
         Query query;
-        if (Grep::process_raw_query(archive, search_string, search_begin_ts, search_end_ts, command_line_args.ignore_case(), query, forward_lexer,
+        if (Grep::process_raw_query(archive, search_string, search_begin_ts, search_end_ts, command_line_args.ignore_case(), query, forward_lexer,
                                     reverse_lexer, use_heuristic)) {
-            //if (Grep::process_raw_query(archive, search_string, search_begin_ts, search_end_ts, command_line_args.ignore_case(), query, parser)) {
             no_queries_match = false;
 
             if (query.contains_sub_queries() == false) {
@@ -390,14 +397,15 @@ int main (int argc, const char* argv[]) {
     }
     global_metadata_db->open();
 
-    /// TODO: if performance is too slow, can make this more efficient by only diffing files with the same checksum
+    // TODO: if performance is too slow, can make this more efficient by only
+    // diffing files with the same checksum
     const uint32_t max_map_schema_length = 100000;
-    std::map<std::string, compressor_frontend::lexers::ByteLexer> forward_lexer_map;
-    std::map<std::string, compressor_frontend::lexers::ByteLexer> reverse_lexer_map;
-    compressor_frontend::lexers::ByteLexer one_time_use_forward_lexer;
-    compressor_frontend::lexers::ByteLexer one_time_use_reverse_lexer;
-    compressor_frontend::lexers::ByteLexer* forward_lexer_ptr;
-    compressor_frontend::lexers::ByteLexer* reverse_lexer_ptr;
+    std::map<std::string, log_surgeon::lexers::ByteLexer> forward_lexer_map;
+    std::map<std::string, log_surgeon::lexers::ByteLexer> reverse_lexer_map;
+    log_surgeon::lexers::ByteLexer one_time_use_forward_lexer;
+    log_surgeon::lexers::ByteLexer one_time_use_reverse_lexer;
+    log_surgeon::lexers::ByteLexer* forward_lexer_ptr;
+    log_surgeon::lexers::ByteLexer* reverse_lexer_ptr;
 
     string archive_id;
     Archive archive_reader;
@@ -416,7 +424,7 @@ int main (int argc, const char* argv[]) {
         if (!open_archive(archive_path.string(), archive_reader)) {
             return -1;
         }
-        
+
         // Generate lexer if schema file exists
         auto schema_file_path = archive_path / streaming_archive::cSchemaFileName;
         bool use_heuristic = true;
@@ -435,12 +443,14 @@ int main (int argc, const char* argv[]) {
             // if there is a chance there might be a difference make a new lexer as it's pretty fast to create
             if (forward_lexer_map_it == forward_lexer_map.end()) {
                 // Create forward lexer
-                auto insert_result = forward_lexer_map.emplace(buf, compressor_frontend::lexers::ByteLexer());
+                auto insert_result
+                        = forward_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer());
                 forward_lexer_ptr = &insert_result.first->second;
                 load_lexer_from_file(schema_file_path, false, *forward_lexer_ptr);
 
                 // Create reverse lexer
-                insert_result = reverse_lexer_map.emplace(buf, compressor_frontend::lexers::ByteLexer());
+                insert_result
+                        = reverse_lexer_map.emplace(buf, log_surgeon::lexers::ByteLexer());
                 reverse_lexer_ptr = &insert_result.first->second;
                 load_lexer_from_file(schema_file_path, true, *reverse_lexer_ptr);
             } else {
diff --git a/components/core/src/clo/clo.cpp b/components/core/src/clo/clo.cpp
index b71836858..1f5439a04 100644
--- a/components/core/src/clo/clo.cpp
+++ b/components/core/src/clo/clo.cpp
@@ -16,7 +16,6 @@
 // Project headers
 #include "../Defs.h"
-#include "../compressor_frontend/utils.hpp"
 #include "../Grep.hpp"
 #include "../Profiler.hpp"
 #include "../networking/socket_utils.hpp"
@@ -27,7 +26,6 @@
 #include "ControllerMonitoringThread.hpp"
 
 using clo::CommandLineArguments;
-using compressor_frontend::load_lexer_from_file;
 using std::cout;
 using std::cerr;
 using std::endl;
@@ -204,16 +202,16 @@ static bool search_archive (const CommandLineArguments& command_line_args, const
 
     // Load lexers from schema file if it exists
     auto schema_file_path = archive_path / streaming_archive::cSchemaFileName;
-    unique_ptr<compressor_frontend::lexers::ByteLexer> forward_lexer, reverse_lexer;
+    unique_ptr<log_surgeon::lexers::ByteLexer> forward_lexer, reverse_lexer;
     bool use_heuristic = true;
     if (boost::filesystem::exists(schema_file_path)) {
         use_heuristic = false;
         // Create forward lexer
-        forward_lexer.reset(new compressor_frontend::lexers::ByteLexer());
+        forward_lexer.reset(new log_surgeon::lexers::ByteLexer());
        load_lexer_from_file(schema_file_path.string(), false, *forward_lexer);
 
         // Create reverse lexer
-        reverse_lexer.reset(new compressor_frontend::lexers::ByteLexer());
+        reverse_lexer.reset(new log_surgeon::lexers::ByteLexer());
        load_lexer_from_file(schema_file_path.string(), true, *reverse_lexer);
     }
 
diff --git a/components/core/src/clp/FileCompressor.cpp b/components/core/src/clp/FileCompressor.cpp
index 9b5fe493a..1eb12af44 100644
--- a/components/core/src/clp/FileCompressor.cpp
+++ b/components/core/src/clp/FileCompressor.cpp
@@ -12,14 +12,22 @@
 // libarchive
 #include
 
+// Log surgeon
+#include <log_surgeon/LogEvent.hpp>
+#include <log_surgeon/ReaderParser.hpp>
+
 // Project headers
 #include "../ffi/ir_stream/decoding_methods.hpp"
 #include "../ir/utils.hpp"
+#include "../LogSurgeonReader.hpp"
 #include "../Profiler.hpp"
 #include "utils.hpp"
 
 using ir::has_ir_stream_magic_number;
 using ir::LogEventDeserializer;
+using log_surgeon::LogEventView;
+using log_surgeon::Reader;
+using log_surgeon::ReaderParser;
 using std::cout;
 using std::endl;
 using std::set;
@@ -123,9 +131,15 @@ namespace clp {
                         file_to_compress.get_path_for_compression(),
                         file_to_compress.get_group_id(), archive_writer, m_file_reader);
             } else {
-                parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size,
-                                 file_to_compress.get_path_for_compression(),
-                                 file_to_compress.get_group_id(), archive_writer, m_file_reader);
+                parse_and_encode_with_library(
+                        target_data_size_of_dicts,
+                        archive_user_config,
+                        target_encoded_file_size,
+                        file_to_compress.get_path_for_compression(),
+                        file_to_compress.get_group_id(),
+                        archive_writer,
+                        m_file_reader
+                );
             }
         } else {
             if (false == try_compressing_as_archive(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, file_to_compress,
@@ -144,10 +158,15 @@ namespace clp {
         return succeeded;
     }
 
-    void FileCompressor::parse_and_encode (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
-                                           size_t target_encoded_file_size, const string& path_for_compression, group_id_t group_id,
-                                           streaming_archive::writer::Archive& archive_writer, ReaderInterface& reader)
-    {
+    void FileCompressor::parse_and_encode_with_library(
+            size_t target_data_size_of_dicts,
+            streaming_archive::writer::Archive::UserConfig& archive_user_config,
+            size_t target_encoded_file_size,
+            string const& path_for_compression,
+            group_id_t group_id,
+            streaming_archive::writer::Archive& archive_writer,
+            ReaderInterface& reader
+    ) {
         archive_writer.m_target_data_size_of_dicts = target_data_size_of_dicts;
         archive_writer.m_archive_user_config = archive_user_config;
         archive_writer.m_path_for_compression = path_for_compression;
@@ -155,26 +174,21 @@ namespace clp {
         archive_writer.m_target_encoded_file_size = target_encoded_file_size;
         // Open compressed file
         archive_writer.create_and_open_file(path_for_compression, group_id, m_uuid_generator(), 0);
-        m_log_parser->set_archive_writer_ptr(&archive_writer);
-        m_log_parser->get_archive_writer_ptr()->old_ts_pattern.clear();
-        try {
-            m_log_parser->parse(reader);
-        } catch (std::string const err) {
-            if (err.find("Lexer failed to find a match after checking entire buffer") != std::string::npos) {
-                close_file_and_append_to_segment(archive_writer);
-                SPDLOG_ERROR(err);
-            } else {
-                throw (err);
+        archive_writer.m_old_ts_pattern = nullptr;
+        LogSurgeonReader log_surgeon_reader(reader);
+        m_reader_parser->reset_and_set_reader(log_surgeon_reader);
+        while (false == m_reader_parser->done()) {
+            if (log_surgeon::ErrorCode err{m_reader_parser->parse_next_event()};
+                log_surgeon::ErrorCode::Success != err) {
+                SPDLOG_ERROR("Parsing Failed");
+                throw (std::runtime_error("Parsing Failed"));
             }
+            LogEventView const& log_view = m_reader_parser->get_log_parser().get_log_event_view();
+            archive_writer.write_msg_using_schema(log_view);
         }
-        // TODO: separate variables from static text
-        //Stopwatch close_file_watch("close_file_watch");
-        //close_file_watch.start();
         close_file_and_append_to_segment(archive_writer);
         // archive_writer_config needs to persist between files
         archive_user_config = archive_writer.m_archive_user_config;
-        //close_file_watch.stop();
-        //close_file_watch.print();
     }
 
     void FileCompressor::parse_and_encode_with_heuristic (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
@@ -292,8 +306,15 @@ namespace clp {
                         boost_path_for_compression.string(), file_to_compress.get_group_id(),
                         archive_writer, m_libarchive_file_reader);
             } else {
-                parse_and_encode(target_data_size_of_dicts, archive_user_config, target_encoded_file_size, boost_path_for_compression.string(),
-                                 file_to_compress.get_group_id(), archive_writer, m_libarchive_file_reader);
+                parse_and_encode_with_library(
+                        target_data_size_of_dicts,
+                        archive_user_config,
+                        target_encoded_file_size,
+                        boost_path_for_compression.string(),
+                        file_to_compress.get_group_id(),
+                        archive_writer,
+                        m_libarchive_file_reader
+                );
             }
         } else if (has_ir_stream_magic_number({utf8_validation_buf, utf8_validation_buf_len})) {
             // Remove .clp suffix if found
diff --git a/components/core/src/clp/FileCompressor.hpp b/components/core/src/clp/FileCompressor.hpp
index 7d87e12db..f0346a616 100644
--- a/components/core/src/clp/FileCompressor.hpp
+++ b/components/core/src/clp/FileCompressor.hpp
@@ -7,9 +7,12 @@
 // Boost libraries
 #include
 
+// Log surgeon
+#include <log_surgeon/LogEvent.hpp>
+#include <log_surgeon/ReaderParser.hpp>
+
 // Project headers
 #include "../BufferedFileReader.hpp"
-#include "../compressor_frontend/LogParser.hpp"
 #include "../ir/LogEventDeserializer.hpp"
 #include "../LibarchiveFileReader.hpp"
 #include "../LibarchiveReader.hpp"
@@ -25,8 +28,12 @@ namespace clp {
     class FileCompressor {
     public:
         // Constructors
-        FileCompressor (boost::uuids::random_generator& uuid_generator, std::unique_ptr<compressor_frontend::LogParser> log_parser) : m_uuid_generator(
-                uuid_generator), m_log_parser(std::move(log_parser)) {}
+        FileCompressor(
+                boost::uuids::random_generator& uuid_generator,
+                std::unique_ptr<log_surgeon::ReaderParser> reader_parser
+        )
+                : m_uuid_generator(uuid_generator),
+                  m_reader_parser(std::move(reader_parser)) {}
 
         // Methods
         /**
@@ -55,9 +62,15 @@ namespace clp {
          * @param archive_writer
         * @param reader
         */
-        void parse_and_encode (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
-                               size_t target_encoded_file_size, const std::string& path_for_compression, group_id_t group_id,
-                               streaming_archive::writer::Archive& archive_writer, ReaderInterface& reader);
+        void parse_and_encode_with_library(
+                size_t target_data_size_of_dicts,
+                streaming_archive::writer::Archive::UserConfig& archive_user_config,
+                size_t target_encoded_file_size,
+                std::string const& path_for_compression,
+                group_id_t group_id,
+                streaming_archive::writer::Archive& archive_writer,
+                ReaderInterface& reader
+        );
 
         void parse_and_encode_with_heuristic (size_t target_data_size_of_dicts, streaming_archive::writer::Archive::UserConfig& archive_user_config,
                                               size_t target_encoded_file_size, const std::string& path_for_compression, group_id_t group_id,
@@ -129,7 +142,7 @@ namespace clp {
         LibarchiveFileReader m_libarchive_file_reader;
         MessageParser m_message_parser;
         ParsedMessage m_parsed_message;
-        std::unique_ptr<compressor_frontend::LogParser> m_log_parser;
+        std::unique_ptr<log_surgeon::ReaderParser> m_reader_parser;
     };
 }
 
diff --git a/components/core/src/clp/compression.cpp b/components/core/src/clp/compression.cpp
index 8b1bf1c52..d82d0b4c8 100644
--- a/components/core/src/clp/compression.cpp
+++ b/components/core/src/clp/compression.cpp
@@ -51,9 +51,15 @@ namespace clp {
         return boost::filesystem::last_write_time(lhs.get_path()) < boost::filesystem::last_write_time(rhs.get_path());
     }
 
-    bool compress (CommandLineArguments& command_line_args, vector<FileToCompress>& files_to_compress, const vector<string>& empty_directory_paths,
-                   vector<StructuredFileToCompress>& grouped_files_to_compress, size_t target_encoded_file_size,
-                   std::unique_ptr<compressor_frontend::LogParser> log_parser, bool use_heuristic) {
+    bool compress(
+            CommandLineArguments& command_line_args,
+            vector<FileToCompress>& files_to_compress,
+            vector<string> const& empty_directory_paths,
+            vector<StructuredFileToCompress>& grouped_files_to_compress,
+            size_t target_encoded_file_size,
+            std::unique_ptr<log_surgeon::ReaderParser> reader_parser,
+            bool use_heuristic
+    ) {
         auto output_dir = boost::filesystem::path(command_line_args.get_output_dir());
 
         // Create output directory in case it doesn't exist
@@ -106,7 +112,7 @@ namespace clp {
         archive_writer.add_empty_directories(empty_directory_paths);
 
         bool all_files_compressed_successfully = true;
-        FileCompressor file_compressor(uuid_generator, std::move(log_parser));
+        FileCompressor file_compressor(uuid_generator, std::move(reader_parser));
         auto target_data_size_of_dictionaries = command_line_args.get_target_data_size_of_dictionaries();
 
         // Compress all files
diff --git a/components/core/src/clp/compression.hpp b/components/core/src/clp/compression.hpp
index 8291acb0b..a86aa1fca 100644
--- a/components/core/src/clp/compression.hpp
+++ b/components/core/src/clp/compression.hpp
@@ -8,11 +8,14 @@
 // Boost libraries
 #include
 
+// Log surgeon
+#include <log_surgeon/LogEvent.hpp>
+#include <log_surgeon/ReaderParser.hpp>
+
 // Project headers
 #include "CommandLineArguments.hpp"
 #include "FileToCompress.hpp"
 #include "StructuredFileToCompress.hpp"
-#include "../compressor_frontend/LogParser.hpp"
 
 namespace clp {
     /**
@@ -22,13 +25,19 @@ namespace clp {
     * @param empty_directory_paths
     * @param grouped_files_to_compress
     * @param target_encoded_file_size
-    * @param log_parser
+    * @param reader_parser
     * @param use_heuristic
     * @return true if compression was successful, false otherwise
     */
-    bool compress (CommandLineArguments& command_line_args, std::vector<FileToCompress>& files_to_compress,
-                   const std::vector<std::string>& empty_directory_paths, std::vector<StructuredFileToCompress>& grouped_files_to_compress,
-                   size_t target_encoded_file_size, std::unique_ptr<compressor_frontend::LogParser> log_parser, bool use_heuristic);
+    bool compress(
+            CommandLineArguments& command_line_args,
+            std::vector<FileToCompress>& files_to_compress,
+            std::vector<std::string> const& empty_directory_paths,
+            std::vector<StructuredFileToCompress>& grouped_files_to_compress,
+            size_t target_encoded_file_size,
+            std::unique_ptr<log_surgeon::ReaderParser> reader_parser,
+            bool use_heuristic
+    );
 
     /**
     * Reads a list of grouped files and a list of their IDs
diff --git a/components/core/src/clp/run.cpp b/components/core/src/clp/run.cpp
index ef9f90e0c..11786a753 100644
--- a/components/core/src/clp/run.cpp
+++ b/components/core/src/clp/run.cpp
@@ -6,8 +6,10 @@
 // spdlog
 #include
 
+// Log Surgeon
+#include <log_surgeon/ReaderParser.hpp>
+
 // Project headers
-#include "../compressor_frontend/LogParser.hpp"
 #include "../Profiler.hpp"
 #include "../spdlog_with_specializations.hpp"
 #include "../Utils.hpp"
@@ -60,10 +62,10 @@ namespace clp {
 
         if (CommandLineArguments::Command::Compress == command_line_args.get_command()) {
             /// TODO: make this not a unique_ptr and test performance difference
-            std::unique_ptr<compressor_frontend::LogParser> log_parser;
+            std::unique_ptr<log_surgeon::ReaderParser> reader_parser;
             if (!command_line_args.get_use_heuristic()) {
                 const std::string& schema_file_path = command_line_args.get_schema_file_path();
-                log_parser = std::make_unique<compressor_frontend::LogParser>(schema_file_path);
+                reader_parser = std::make_unique<log_surgeon::ReaderParser>(schema_file_path);
             }
 
             boost::filesystem::path path_prefix_to_remove(command_line_args.get_path_prefix_to_remove());
@@ -91,9 +93,15 @@ namespace clp {
 
             bool compression_successful;
             try {
-                compression_successful = compress(command_line_args, files_to_compress, empty_directory_paths, grouped_files_to_compress,
-                                                  command_line_args.get_target_encoded_file_size(), std::move(log_parser),
-                                                  command_line_args.get_use_heuristic());
+                compression_successful = compress(
+                        command_line_args,
+                        files_to_compress,
+                        empty_directory_paths,
+                        grouped_files_to_compress,
+                        command_line_args.get_target_encoded_file_size(),
+                        std::move(reader_parser),
+                        command_line_args.get_use_heuristic()
+                );
             } catch (TraceableException& e) {
                 ErrorCode error_code = e.get_error_code();
                 if (ErrorCode_errno == error_code) {
diff --git a/components/core/src/compressor_frontend/Constants.hpp b/components/core/src/compressor_frontend/Constants.hpp
deleted file mode 100644
index ed31f1ce5..000000000
--- a/components/core/src/compressor_frontend/Constants.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_CONSTANTS_HPP
-#define COMPRESSOR_FRONTEND_CONSTANTS_HPP
-
-#include
-
-namespace compressor_frontend {
-
-    typedef std::pair<uint32_t, uint32_t> Interval;
-
-    constexpr uint32_t cUnicodeMax = 0x10FFFF;
-    constexpr uint32_t cSizeOfByte = 256;
-    constexpr uint32_t cSizeOfAllChildren = 10000;
-    constexpr uint32_t cNullSymbol = 10000000;
-
-    enum class SymbolID {
-        TokenEndID,
-        TokenUncaughtStringID,
-        TokenIntId,
-        TokenFloatId,
-        TokenFirstTimestampId,
-        TokenNewlineTimestampId,
-        TokenNewlineId
-    };
-
-    constexpr char cTokenEnd[] = "$end";
-    constexpr char cTokenUncaughtString[] = "$UncaughtString";
-    constexpr char cTokenInt[] = "int";
-    constexpr char cTokenFloat[] = "float";
-    constexpr char cTokenFirstTimestamp[] = "firstTimestamp";
-    constexpr char cTokenNewlineTimestamp[] = "newLineTimestamp";
-    constexpr char cTokenNewline[] = "newLine";
-
-    constexpr uint32_t cStaticByteBuffSize = 60000;
-
-    namespace utf8 {
-        //0xC0, 0xC1, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF are invalid UTF-8 code units
-        static const uint32_t cError = 0xFE;
-        static const unsigned char cCharEOF = 0xFF;
-    };
-}
-
-#endif // COMPRESSOR_FRONTEND_CONSTANTS_HPP
diff --git a/components/core/src/compressor_frontend/LALR1Parser.cpp b/components/core/src/compressor_frontend/LALR1Parser.cpp
deleted file mode 100644
index 721b926d2..000000000
--- a/components/core/src/compressor_frontend/LALR1Parser.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "LALR1Parser.hpp"
-
-namespace compressor_frontend {
-    MatchedSymbol NonTerminal::m_all_children[cSizeOfAllChildren];
-
-    ParserAST::~ParserAST () = default;
-
-    uint32_t NonTerminal::m_next_children_start = 0;
-
-    NonTerminal::NonTerminal (Production* p) : m_production(p), m_ast(nullptr) {
-        m_children_start = NonTerminal::m_next_children_start;
-        NonTerminal::m_next_children_start += p->m_body.size();
-    }
-}
diff --git a/components/core/src/compressor_frontend/LALR1Parser.hpp b/components/core/src/compressor_frontend/LALR1Parser.hpp
deleted file mode 100644
index 9af75a2c6..000000000
--- a/components/core/src/compressor_frontend/LALR1Parser.hpp
+++ /dev/null
@@ -1,421 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_LALR1_PARSER_HPP
-#define COMPRESSOR_FRONTEND_LALR1_PARSER_HPP
-
-// C++ standard libraries
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-// Project headers
-#include "../ReaderInterface.hpp"
-#include "../type_utils.hpp"
-#include "Lexer.hpp"
-
-namespace streaming_archive::writer {
-    class File;
-
-    class Archive;
-}
-
-namespace compressor_frontend {
-
-    class ParserAST;
-
-    class NonTerminal;
-
-    template <typename T>
-    class ParserValue;
-
-    struct Production;
-    struct Item;
-    struct ItemSet;
-
-    typedef std::function<std::unique_ptr<ParserAST> (NonTerminal*)> SemanticRule;
-    typedef std::variant Action;
-
-    class ParserAST {
-    public:
-        // Constructor
-        virtual ~ParserAST () = 0;
-
-        template <typename T>
-        T& get () {
-            // TODO: why does this compile?
-            return static_cast<ParserValue<T>*>(this)->value;
-        }
-    };
-
-    template <typename T>
-    class ParserValue : public ParserAST {
-    public:
-        T value;
-
-        explicit ParserValue (T v) : value(std::move(v)) {}
-    };
-
-    typedef std::variant<Token, NonTerminal> MatchedSymbol;
-
-    class NonTerminal {
-    public:
-        // Constructor
-        NonTerminal () : m_production(nullptr), m_children_start(0), m_ast(nullptr) {}
-
-        // Constructor
-        explicit NonTerminal (Production*);
-
-        /**
-         * Return the ith child's (body of production) MatchedSymbol as a Token.
-         * Note: only children are needed (and stored) for performing semantic actions (for the AST)
-         * @param i
-         * @return Token*
-         */
-        [[nodiscard]] Token* token_cast (int i) const {
-            return &std::get<Token>(NonTerminal::m_all_children[m_children_start + i]);
-        }
-
-        /**
-         * Return the ith child's (body of production) MatchedSymbol as a NonTerminal.
-         * Note: only children are needed (and stored) for performing semantic actions (for the AST)
-         * @param i
-         * @return NonTerminal*
-         */
-        [[nodiscard]] NonTerminal* nonterminal_cast (int i) const {
-            return &std::get<NonTerminal>(NonTerminal::m_all_children[m_children_start + i]);
-        }
-
-        /**
-         * Return the AST that relates this nonterminal's children together (based on the production/syntax-rule that was determined to have generated them)
-         * @return std::unique_ptr<ParserAST>
-         */
-        std::unique_ptr<ParserAST>& getParserAST () {
-            return m_ast;
-        }
-
-        static MatchedSymbol m_all_children[];
-        static uint32_t m_next_children_start;
-        uint32_t m_children_start;
-        Production* m_production;
-        std::unique_ptr<ParserAST> m_ast;
-    };
-
-    /**
-     * Structure representing a production of the form "m_head -> {m_body}".
-     * The code fragment to execute upon reducing "{m_body} -> m_head" is m_semantic_rule, which is purely a function of the MatchedSymbols for {m_body}.
-     * m_index is the productions position in the parsers production vector.
- */ - struct Production { - public: - /** - * Returns if the production is an epsilon production. An epsilon production has nothing on its LHS (i.e., HEAD -> {}) - * @return bool - */ - [[nodiscard]] bool is_epsilon () const { - return this->m_body.empty(); - } - - uint32_t m_index; - uint32_t m_head; - std::vector m_body; - SemanticRule m_semantic_rule; - }; - - /** - * Structure representing an item in a LALR1 state. - * An item (1) is associated with a m_production and a single m_lookahead which is an input symbol (character) that can follow the m_production, - * and (2) tracks the current matching progress of its associated m_production, where everything exclusively to the left of m_dot is already matched. - */ - struct Item { - public: - // Constructor - Item () = default; - - // Constructor - Item (Production* p, uint32_t d, uint32_t t) : m_production(p), m_dot(d), m_lookahead(t) { - } - - /** - * Comparison operator for tie-breakers (not 100% sure where this is used) - * @param lhs - * @param rhs - * @return bool - */ - friend bool operator< (const Item& lhs, const Item& rhs) { - return std::tie(lhs.m_production->m_index, lhs.m_dot, lhs.m_lookahead) < - std::tie(rhs.m_production->m_index, rhs.m_dot, rhs.m_lookahead); - } - - /** - * Returns if the item has a dot at the end. This indicates the production associated with the item has already been fully matched. - * @return bool - */ - [[nodiscard]] bool has_dot_at_end () const { - return this->m_dot == this->m_production->m_body.size(); - } - - /** - * Returns the next unmatched symbol in the production based on the dot. - * @return uint32_t - */ - [[nodiscard]] uint32_t next_symbol () const { - return this->m_production->m_body.at(this->m_dot); - } - - Production* m_production; - uint32_t m_dot; - uint32_t m_lookahead; // for LR0 items, `m_lookahead` is unused - }; - - /** - * Structure representing an LALR1 state, a collection of items. - * The m_kernel is sufficient for fully representing the state, but m_closure is useful for computations. - * m_next indicates what state (ItemSet) to transition to based on the symbol received from the lexer - * m_actions is the action to perform based on the symbol received from the lexer. - */ - struct ItemSet { - public: - /** - * Comparison operator for tie-breakers (not 100% sure where this is used) - * @param lhs - * @param rhs - * @return bool - */ - friend bool operator< (const ItemSet& lhs, const ItemSet& rhs) { - return lhs.m_kernel < rhs.m_kernel; - } - - bool empty () const { - return m_kernel.empty(); - } - - uint32_t m_index = -1; - std::set m_kernel; - std::set m_closure; - std::unordered_map m_next; - std::vector m_actions; - }; - - /// TODO: make LALR1Parser an abstract class? 
- template - class LALR1Parser { - public: - // Constructor - LALR1Parser (); - - /// TODO: combine all the add_* into add_rule - /** - * Add a lexical rule to m_lexer - * @param name - * @param rule - */ - void add_rule (const std::string& name, std::unique_ptr> rule); - - /** - * Constructs a RegexASTLiteral and call add_rule - * @param name - * @param rule_char - */ - void add_token (const std::string& name, char rule_char); - - /** - * Calls add_rule with the given RegexASTGroup - * @param name - * @param rule_char - */ - void add_token_group (const std::string& name, std::unique_ptr> rule_group); - - /** - * Constructs a RegexASTCat and calls add_rule - * @param name - * @param chain - */ - void add_token_chain (const std::string& name, const std::string& chain); - - /** - * Adds productions (syntax rule) to the parser - * @param head - * @param body - * @param semantic_rule - * @return uint32_t - */ - uint32_t add_production (const std::string& head, const std::vector& body, SemanticRule semantic_rule); - - /** - * Generate the LALR1 parser (use after all the lexical rules and productions have been added) - */ - void generate (); - - /// TODO: add throws to function headers - /** - * Parse an input (e.g. file) - * @param reader - * @return Nonterminal - */ - NonTerminal parse (ReaderInterface& reader); - - void set_archive_writer_ptr (streaming_archive::writer::Archive* value) { - m_archive_writer_ptr = value; - } - - [[nodiscard]] streaming_archive::writer::Archive* get_archive_writer_ptr () const { - return m_archive_writer_ptr; - } - - protected: - /** - * Reset the parser to start a new parsing (set state to root, reset buffers, reset vars tracking positions) - * @param reader - */ - void reset (ReaderInterface& reader); - - /** - * Return an error string based on the current error state, matched_stack, and next_symbol in the parser - * @param reader - * @return std::string - */ - std::string report_error (ReaderInterface& reader); - - Lexer m_lexer; - streaming_archive::writer::Archive* m_archive_writer_ptr; - std::stack m_parse_stack_matches; - std::stack m_parse_stack_states; - ItemSet* root_itemset_ptr; - std::optional m_next_token; - std::vector> m_productions; - std::unordered_map, Production*>> m_productions_map; - std::unordered_map> m_nonterminals; - uint32_t m_root_production_id; - - private: - // Parser generation - - /** - * Generate LR0 kernels based on the productions in m_productions - */ - void generate_lr0_kernels (); - - /** - * Perform closure for the specified item_set based on its kernel - * @param item_set - */ - void generate_lr0_closure (ItemSet* item_set_ptr); - - /** - * Helper function for doing the closure on a specified item set - * @param item_set_ptr - * @param item - * @param next_symbol - * @return bool - */ - bool lr_closure_helper (ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol); - - /** - * Return the next state (ItemSet) based on the current state (ItemSet) and input symbol - * @return ItemSet* - */ - ItemSet* go_to (ItemSet*, const uint32_t&); - - /** - * Generate m_firsts, which specify for each symbol, all possible prefixes (I think?) 
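- * (i.e., the standard FIRST sets: for each symbol, the set of terminals that can begin a string derived from it.
- * For example, with A -> B 'a', B -> 'b', B -> {}, the fixed point gives FIRST(B) = {b} and FIRST(A) = {b, a}.)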
- */ - void generate_first_sets (); - - /** - * Generate kernels for LR1 item sets based on LR0 item sets - */ - void generate_lr1_itemsets (); - - /** - * Generate closure for a specified LR1 item set - * @param item_set_ptr - */ - void generate_lr1_closure (ItemSet* item_set_ptr); - - /** - * Generating parsing table and goto table for LALR1 parser based on state-symbol pair - * generate_lalr1_goto() + generate_lalr1_action() - */ - void generate_lalr1_parsing_table (); - - /** - * Generating the goto table for LARL1 parser specifying which state (ItemSet) to transition to based on state-symbol pair - * Does nothing (its already done in an earlier step) - */ - void generate_lalr1_goto (); - - /** - * Generating the action table for LARL1 parser specifying which action to perform based on state-symbol pair - */ - void generate_lalr1_action (); - - // Parser utilization - - /** - * Use the previous symbol from the lexer if unused, otherwise request the next symbol from the lexer - * @return Token - */ - Token get_next_symbol (); - - /** - * Tries all symbols in the language that the next token may be until the first non-error symbol is tried - * @param next_token - * @param accept - * @return bool - */ - bool parse_advance (Token& next_token, bool* accept); - - /** - * Perform an action and state transition based on the current state (ItemSet) and the type_id (current symbol interpretation of the next_token) - * @param type_id - * @param next_token - * @param accept - * @return bool - */ - bool parse_symbol (uint32_t const& type_id, Token& next_token, bool* accept); - - // Error handling - - /** - * Get the current line up to the error symbol - * @param parse_stack_matches - * @return std::string - */ - static std::string get_input_after_last_newline (std::stack& parse_stack_matches); - - /** - * Get the current line after the error symbol - * @param reader - * @param error_token - * @return std::string - */ - std::string get_input_until_next_newline (ReaderInterface& reader, Token* error_token); - - bool symbol_is_token (uint32_t s) { - return m_terminals.find(s) != m_terminals.end(); - } - - // Variables - std::set m_terminals; - std::set m_nullable; - std::map, std::unique_ptr> m_lr0_itemsets; - std::map, std::unique_ptr> m_lr1_itemsets; - std::unordered_map> m_firsts; - std::unordered_map> m_spontaneous_map; - std::map> m_propagate_map; - std::unordered_map> m_go_to_table; - }; -} - -#include "LALR1Parser.inc" - -#endif // COMPRESSOR_FRONTEND_LALR1_PARSER_HPP diff --git a/components/core/src/compressor_frontend/LALR1Parser.inc b/components/core/src/compressor_frontend/LALR1Parser.inc deleted file mode 100644 index 3e82883a3..000000000 --- a/components/core/src/compressor_frontend/LALR1Parser.inc +++ /dev/null @@ -1,689 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_LALR1_PARSER_TPP -#define COMPRESSOR_FRONTEND_LALR1_PARSER_TPP - -#include "LALR1Parser.hpp" - -// C++ standard libraries -#include -#include - -// Boost libraries -#include - -// Project headers -#include "../FileReader.hpp" -#include "../streaming_archive/writer/Archive.hpp" - -using compressor_frontend::finite_automata::RegexAST; -using compressor_frontend::finite_automata::RegexASTCat; -using compressor_frontend::finite_automata::RegexASTGroup; -using compressor_frontend::finite_automata::RegexASTInteger; -using compressor_frontend::finite_automata::RegexASTLiteral; -using compressor_frontend::finite_automata::RegexASTMultiplication; -using compressor_frontend::finite_automata::RegexASTOr; -using std::cout; -using std::deque; 
-using std::holds_alternative; -using std::make_unique; -using std::map; -using std::pair; -using std::set; -using std::string; -using std::unique_ptr; -using std::vector; - -namespace compressor_frontend { - template - LALR1Parser::LALR1Parser () : m_archive_writer_ptr(nullptr), root_itemset_ptr(nullptr), m_root_production_id(0) { - m_lexer.m_symbol_id[cTokenEnd] = (int) SymbolID::TokenEndID; - m_lexer.m_symbol_id[cTokenUncaughtString] = (int) SymbolID::TokenUncaughtStringID; - m_lexer.m_symbol_id[cTokenInt] = (int) SymbolID::TokenIntId; - m_lexer.m_symbol_id[cTokenFloat] = (int) SymbolID::TokenFloatId; - m_lexer.m_symbol_id[cTokenFirstTimestamp] = (int) SymbolID::TokenFirstTimestampId; - m_lexer.m_symbol_id[cTokenNewlineTimestamp] = (int) SymbolID::TokenNewlineTimestampId; - m_lexer.m_symbol_id[cTokenNewline] = (int) SymbolID::TokenNewlineId; - - m_lexer.m_id_symbol[(int) SymbolID::TokenEndID] = cTokenEnd; - m_lexer.m_id_symbol[(int) SymbolID::TokenUncaughtStringID] = cTokenUncaughtString; - m_lexer.m_id_symbol[(int) SymbolID::TokenIntId] = cTokenInt; - m_lexer.m_id_symbol[(int) SymbolID::TokenFloatId] = cTokenFloat; - m_lexer.m_id_symbol[(int) SymbolID::TokenFirstTimestampId] = cTokenFirstTimestamp; - m_lexer.m_id_symbol[(int) SymbolID::TokenNewlineTimestampId] = cTokenNewlineTimestamp; - m_lexer.m_id_symbol[(int) SymbolID::TokenNewlineId] = cTokenNewline; - - m_terminals.insert((int) SymbolID::TokenEndID); - m_terminals.insert((int) SymbolID::TokenUncaughtStringID); - m_terminals.insert((int) SymbolID::TokenIntId); - m_terminals.insert((int) SymbolID::TokenFloatId); - m_terminals.insert((int) SymbolID::TokenFirstTimestampId); - m_terminals.insert((int) SymbolID::TokenNewlineTimestampId); - m_terminals.insert((int) SymbolID::TokenNewlineId); - } - - - template - void LALR1Parser::add_rule (const string& name, unique_ptr> rule) { - if (m_lexer.m_symbol_id.find(name) == m_lexer.m_symbol_id.end()) { - m_lexer.m_symbol_id[name] = m_lexer.m_symbol_id.size(); - m_lexer.m_id_symbol[m_lexer.m_symbol_id[name]] = name; - - } - m_lexer.add_rule(m_lexer.m_symbol_id[name], std::move(rule)); - m_terminals.insert(m_lexer.m_symbol_id[name]); - } - - template - void LALR1Parser::add_token (const string& name, char rule_char) { - add_rule(name, make_unique>(RegexASTLiteral(rule_char))); - } - - template - void LALR1Parser::add_token_group (const string& name, unique_ptr> rule_group) { - add_rule(name, std::move(rule_group)); - } - - template - void LALR1Parser::add_token_chain (const string& name, const string& chain) { - assert(chain.size() > 1); - unique_ptr> first_char_rule = make_unique>(RegexASTLiteral(chain[0])); - unique_ptr> second_char_rule = make_unique>(RegexASTLiteral(chain[1])); - unique_ptr> rule_chain = make_unique>(std::move(first_char_rule), std::move(second_char_rule)); - for (uint32_t i = 2; i < chain.size(); i++) { - char next_char = chain[i]; - unique_ptr> next_char_rule = make_unique>(RegexASTLiteral(next_char)); - rule_chain = make_unique>(std::move(rule_chain), std::move(next_char_rule)); - } - add_rule(name, std::move(rule_chain)); - } - - template - uint32_t LALR1Parser::add_production (const string& head, const vector& body, SemanticRule semantic_rule) { - if (m_lexer.m_symbol_id.find(head) == m_lexer.m_symbol_id.end()) { - m_lexer.m_symbol_id[head] = m_lexer.m_symbol_id.size(); - m_lexer.m_id_symbol[m_lexer.m_symbol_id[head]] = head; - } - uint32_t n = m_productions.size(); - auto it = m_productions_map.find(head); - if (it != m_productions_map.end()) { - map, 
Production*>::iterator it2; - it2 = it->second.find(body); - if (it2 != it->second.end()) { - it2->second->m_semantic_rule = semantic_rule; - return n; - } - } - unique_ptr p(new Production); - p->m_index = n; - p->m_head = m_lexer.m_symbol_id[head]; - for (const string& symbol_string: body) { - if (m_lexer.m_symbol_id.find(symbol_string) == m_lexer.m_symbol_id.end()) { - m_lexer.m_symbol_id[symbol_string] = m_lexer.m_symbol_id.size(); - m_lexer.m_id_symbol[m_lexer.m_symbol_id[symbol_string]] = symbol_string; - } - p->m_body.push_back(m_lexer.m_symbol_id[symbol_string]); - } - p->m_semantic_rule = std::move(semantic_rule); - m_nonterminals.insert(pair>(p->m_head, {})); - m_nonterminals[p->m_head].push_back(p.get()); - m_productions_map[head][body] = p.get(); - m_productions.push_back(std::move(p)); - if (m_productions.size() == 1) { - m_root_production_id = add_production("$START_PRIME", {head}, nullptr); - } - return n; - } - - template - void LALR1Parser::generate () { - m_lexer.generate(); - assert(!m_productions.empty()); - generate_lr0_kernels(); - generate_first_sets(); - generate_lr1_itemsets(); - generate_lalr1_parsing_table(); - } - - template - void LALR1Parser::generate_lr0_kernels () { - Production* root_production_ptr = m_productions[m_root_production_id].get(); - Item root_item(root_production_ptr, 0, cNullSymbol); - unique_ptr item_set0 = make_unique(); - item_set0->m_kernel.insert(root_item); - deque unused_item_sets; - item_set0->m_index = m_lr0_itemsets.size(); - unused_item_sets.push_back(item_set0.get()); - m_lr0_itemsets[item_set0->m_kernel] = std::move(item_set0); - while (!unused_item_sets.empty()) { - ItemSet* item_set_ptr = unused_item_sets.back(); - unused_item_sets.pop_back(); - generate_lr0_closure(item_set_ptr); - for (const uint32_t& next_symbol: m_terminals) { - ItemSet* new_item_set_ptr = go_to(item_set_ptr, next_symbol); - if (new_item_set_ptr != nullptr) { - unused_item_sets.push_back(new_item_set_ptr); - } - } - for (map>::value_type const& kv: m_nonterminals) { - uint32_t next_symbol = kv.first; - ItemSet* new_item_set_ptr = go_to(item_set_ptr, next_symbol); - if (new_item_set_ptr != nullptr) { - unused_item_sets.push_back(new_item_set_ptr); - } - } - } - } - - template - bool LALR1Parser::lr_closure_helper (ItemSet* item_set_ptr, const Item* item, uint32_t* next_symbol) { - if (!item_set_ptr->m_closure.insert(*item).second) { // add {S'->(dot)S, ""} - return true; - } - if (item->has_dot_at_end()) { - return true; - } - *next_symbol = item->next_symbol(); - if (this->symbol_is_token(*next_symbol)) { // false - return true; - } - return false; - } - - template - void LALR1Parser::generate_lr0_closure (ItemSet* item_set_ptr) { - deque q(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); // {{S'->(dot)S, ""}} - while (!q.empty()) { - Item item = q.back(); // {S'->(dot)S, ""} - q.pop_back(); - uint32_t next_symbol; - if (lr_closure_helper(item_set_ptr, &item, &next_symbol)) { - continue; - } - if (m_nonterminals.find(next_symbol) == m_nonterminals.end()) { - assert(false); - } - for (Production* const p: m_nonterminals.at(next_symbol)) { // S -> a - q.emplace_back(p, 0, cNullSymbol); // {S -> (dot) a, ""} - } - } - } - - template - ItemSet* LALR1Parser::go_to (ItemSet* from_item_set, const uint32_t& next_symbol) { - unique_ptr next_item_set_ptr = make_unique(); - assert(from_item_set != nullptr); - for (Item const& item: from_item_set->m_closure) { - if (item.has_dot_at_end()) { - continue; - } - if (item.next_symbol() == next_symbol) { - 
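- // Advance the dot past next_symbol: an item A -> alpha . X beta with X == next_symbol becomes A -> alpha X . beta in the kernel of GOTO(from_item_set, X).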
next_item_set_ptr->m_kernel.emplace(item.m_production, item.m_dot + 1, item.m_lookahead); - } - } - if (next_item_set_ptr->m_kernel.empty()) { - return nullptr; - } - if (m_lr0_itemsets.find(next_item_set_ptr->m_kernel) != m_lr0_itemsets.end()) { - ItemSet* existing_item_set_ptr = m_lr0_itemsets[next_item_set_ptr->m_kernel].get(); - m_go_to_table[from_item_set->m_index][next_symbol] = existing_item_set_ptr->m_index; - from_item_set->m_next[next_symbol] = existing_item_set_ptr; - } else { - next_item_set_ptr->m_index = m_lr0_itemsets.size(); - m_go_to_table[from_item_set->m_index][next_symbol] = next_item_set_ptr->m_index; - from_item_set->m_next[next_symbol] = next_item_set_ptr.get(); - m_lr0_itemsets[next_item_set_ptr->m_kernel] = std::move(next_item_set_ptr); - return from_item_set->m_next[next_symbol]; - } - return nullptr; - } - - template - void LALR1Parser::generate_first_sets () { - for (uint32_t const& s: m_terminals) { - m_firsts.insert(pair>(s, {s})); - } - bool changed = true; - while (changed) { - changed = false; - for (const unique_ptr& p: m_productions) { - set& f = m_firsts[p->m_head]; - if (p->is_epsilon()) { - changed = changed || m_nullable.insert(p->m_head).second; - continue; - } - size_t old = f.size(); - size_t i = 0; - for (uint32_t const& s: p->m_body) { - set& f2 = m_firsts[s]; - f.insert(f2.begin(), f2.end()); - if (m_nullable.find(s) == m_nullable.end()) { - break; - } - i++; - } - if (i == p->m_body.size()) { - changed = changed || m_nullable.insert(p->m_head).second; - } - changed = changed || (f.size() != old); - } - } - } - - template - void LALR1Parser::generate_lr1_itemsets () { - for (map, unique_ptr>::value_type const& kv: m_lr0_itemsets) { - for (Item const& l0_item: kv.second->m_kernel) { - ItemSet temp_item_set; - temp_item_set.m_kernel.insert(l0_item); - generate_lr1_closure(&temp_item_set); - for (Item const& l1_item: temp_item_set.m_closure) { - if (l1_item.m_lookahead != cNullSymbol) { - m_spontaneous_map[l1_item.m_production].insert(l1_item.m_lookahead); - } else { - if (l1_item.m_dot < l1_item.m_production->m_body.size()) { - Item temp_item(l1_item.m_production, l1_item.m_dot + 1, cNullSymbol); - m_propagate_map[l0_item].insert(temp_item); - } - } - } - } - } - map> lookaheads; - for (map, unique_ptr>::value_type const& kv: m_lr0_itemsets) { - for (Item const& l0_item: kv.second->m_kernel) { - lookaheads[l0_item].insert(m_spontaneous_map[l0_item.m_production].begin(), - m_spontaneous_map[l0_item.m_production].end()); - if (l0_item.m_production == m_productions[m_root_production_id].get()) { - lookaheads[l0_item].insert((int) SymbolID::TokenEndID); - } - } - } - bool changed = true; - while (changed) { - changed = false; - for (map>::value_type& kv: m_propagate_map) { - Item item_from = kv.first; - for (Item const& item_to: kv.second) { - size_t size_before = lookaheads[item_to].size(); - lookaheads[item_to].insert(lookaheads[item_from].begin(), lookaheads[item_from].end()); - size_t size_after = lookaheads[item_to].size(); - changed = changed || size_after > size_before; - } - } - } - for (map, unique_ptr>::value_type const& kv: m_lr0_itemsets) { - unique_ptr lr1_item_set_ptr = make_unique(); - for (Item const& l0_item: kv.second->m_kernel) { - for (int const& lookahead: lookaheads[l0_item]) { - Item lr1_item(l0_item.m_production, l0_item.m_dot, lookahead); - lr1_item_set_ptr->m_kernel.insert(lr1_item); - } - if (l0_item.m_production == m_productions[m_root_production_id].get() && l0_item.m_dot == 0) { - root_itemset_ptr = 
lr1_item_set_ptr.get(); - } - } - generate_lr1_closure(lr1_item_set_ptr.get()); - lr1_item_set_ptr->m_index = kv.second->m_index; - m_lr1_itemsets[lr1_item_set_ptr->m_kernel] = std::move(lr1_item_set_ptr); - } - // this seems like the wrong way to do this still: - for (map, unique_ptr>::value_type const& kv1: m_lr1_itemsets) { - for (map::value_type next_index: m_go_to_table[kv1.second->m_index]) { - bool success = false; - for (map, unique_ptr>::value_type const& kv2: m_lr1_itemsets) { - if (next_index.second == kv2.second->m_index) { - kv1.second->m_next[next_index.first] = kv2.second.get(); - success = true; - break; - } - } - assert(success); - } - } - } - - template - void LALR1Parser::generate_lr1_closure (ItemSet* item_set_ptr) { - deque queue(item_set_ptr->m_kernel.begin(), item_set_ptr->m_kernel.end()); - while (!queue.empty()) { - Item item = queue.back(); - queue.pop_back(); - uint32_t next_symbol; - if (lr_closure_helper(item_set_ptr, &item, &next_symbol)) { - continue; - } - vector lookaheads; - size_t pos = item.m_dot + 1; - while (pos < item.m_production->m_body.size()) { - uint32_t symbol = item.m_production->m_body.at(pos); - set symbol_firsts = m_firsts.find(symbol)->second; - lookaheads.insert(lookaheads.end(), std::make_move_iterator(symbol_firsts.begin()), - std::make_move_iterator(symbol_firsts.end())); - if (m_nullable.find(symbol) == m_nullable.end()) { - break; - } - pos++; - } - if (pos == item.m_production->m_body.size()) { - lookaheads.push_back(item.m_lookahead); - } - for (Production* const p: m_nonterminals.at(next_symbol)) { - for (uint32_t const& l: lookaheads) { - queue.emplace_back(p, 0, l); - } - } - } - } - - template - void LALR1Parser::generate_lalr1_parsing_table () { - generate_lalr1_goto(); - generate_lalr1_action(); - } - - template - void LALR1Parser::generate_lalr1_goto () { - // done already at end of generate_lr1_itemsets()? 
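- // (The goto transitions were already recorded in m_next and m_go_to_table while building the LR(1) item sets, so this pass is intentionally empty.)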
- } - - // Dragon book page 253 - template - void LALR1Parser::generate_lalr1_action () { - for (map, unique_ptr>::value_type const& kv: m_lr1_itemsets) { - ItemSet* item_set_ptr = kv.second.get(); - item_set_ptr->m_actions.resize(m_lexer.m_symbol_id.size(), false); - for (Item const& item: item_set_ptr->m_closure) { - if (!item.has_dot_at_end()) { - if (m_terminals.find(item.next_symbol()) == m_terminals.end() && - m_nonterminals.find(item.next_symbol()) == m_nonterminals.end()) { - continue; - } - assert(item_set_ptr->m_next.find(item.next_symbol()) != item_set_ptr->m_next.end()); - Action& action = item_set_ptr->m_actions[item.next_symbol()]; - if (!holds_alternative(action)) { - if (holds_alternative(action) && std::get(action) == item_set_ptr->m_next[item.next_symbol()]) { - continue; - } - cout << "Warning: For symbol " << m_lexer.m_id_symbol[item.next_symbol()] << ", adding shift to " - << item_set_ptr->m_next[item.next_symbol()]->m_index << " causes "; - if (holds_alternative(action)) { - cout << "shift-shift conflict with shift to " << std::get(action)->m_index << std::endl; - } else { - cout << "shift-reduce conflict with reduction " << m_lexer.m_id_symbol[std::get(action)->m_head] - << "-> {"; - for (uint32_t symbol: std::get(action)->m_body) { - cout << m_lexer.m_id_symbol[symbol] << ","; - } - cout << "}" << std::endl; - } - } - item_set_ptr->m_actions[item.next_symbol()] = item_set_ptr->m_next[item.next_symbol()]; - } - if (item.has_dot_at_end()) { - if (item.m_production == m_productions[m_root_production_id].get()) { - Action action = true; - item_set_ptr->m_actions[(int) SymbolID::TokenEndID] = action; - } else { - Action& action = item_set_ptr->m_actions[item.m_lookahead]; - if (!holds_alternative(action)) { - cout << "Warning: For symbol " << m_lexer.m_id_symbol[item.m_lookahead] - << ", adding reduction " << m_lexer.m_id_symbol[item.m_production->m_head] << "-> {"; - for (uint32_t symbol: item.m_production->m_body) { - cout << m_lexer.m_id_symbol[symbol] << ","; - } - cout << "} causes "; - if (holds_alternative(action)) { - cout << "shift-reduce conflict with shift to " << std::get(action)->m_index << std::endl; - } else { - cout << "reduce-reduce conflict with reduction " - << m_lexer.m_id_symbol[std::get(action)->m_head] - << "-> {"; - for (uint32_t symbol: std::get(action)->m_body) { - cout << m_lexer.m_id_symbol[symbol] << ","; - } - cout << "}" << std::endl; - } - } - item_set_ptr->m_actions[item.m_lookahead] = item.m_production; - } - } - } - } - } - - static uint32_t get_line_num (MatchedSymbol& top_symbol) { - uint32_t line_num = -1; - std::stack symbols; - symbols.push(std::move(top_symbol)); - while (line_num == -1) { - assert(!symbols.empty()); - MatchedSymbol& curr_symbol = symbols.top(); - std::visit(overloaded{ - [&line_num] (Token& token) { - line_num = token.m_line; - }, - [&symbols] (NonTerminal& m) { - for (int i = 0; i < m.m_production->m_body.size(); i++) { - symbols.push(std::move(NonTerminal::m_all_children[m.m_children_start + i])); - } - } - }, curr_symbol); - symbols.pop(); - } - return line_num; - } - - template - string LALR1Parser::get_input_after_last_newline (std::stack& parse_stack_matches) { - string error_message_reversed; - bool done = false; - while (!parse_stack_matches.empty() && !done) { - MatchedSymbol top_symbol = std::move(parse_stack_matches.top()); - parse_stack_matches.pop(); - std::visit(overloaded{ - [&error_message_reversed, &done] (Token& token) { - if (token.get_string() == "\r" || token.get_string() == "\n") { - done = 
true; - } else { - // input is being read backwards, so reverse each token so that when the entire input is reversed - // each token is displayed correctly - string token_string = token.get_string(); - std::reverse(token_string.begin(), token_string.end()); - error_message_reversed += token_string; - } - }, - [&parse_stack_matches] (NonTerminal& m) { - for (int i = 0; i < m.m_production->m_body.size(); i++) { - parse_stack_matches.push(std::move(NonTerminal::m_all_children[m.m_children_start + i])); - } - } - }, top_symbol); - } - std::reverse(error_message_reversed.begin(), error_message_reversed.end()); - return error_message_reversed; - } - - template - string LALR1Parser::get_input_until_next_newline (ReaderInterface& reader, Token* error_token) { - string rest_of_line; - bool next_is_end_token = (error_token->m_type_ids->at(0) == (int) SymbolID::TokenEndID); - bool next_has_newline = (error_token->get_string().find('\n') != string::npos) || (error_token->get_string().find('\r') != string::npos); - while (!next_has_newline && !next_is_end_token) { - Token token = get_next_symbol(); - next_has_newline = (token.get_string().find('\n') != string::npos) || (token.get_string().find('\r') != string::npos); - if (!next_has_newline) { - rest_of_line += token.get_string(); - next_is_end_token = (token.m_type_ids->at(0) == (int) SymbolID::TokenEndID); - } - } - rest_of_line += "\n"; - return rest_of_line; - } - - static string unescape (char const& c) { - switch (c) { - case '\t': - return "\\t"; - case '\r': - return "\\r"; - case '\n': - return "\\n"; - case '\v': - return "\\v"; - case '\f': - return "\\f"; - default: - return {c}; - } - } - - template - string LALR1Parser::report_error (ReaderInterface& reader) { - assert(m_next_token == std::nullopt); - assert(!m_parse_stack_matches.empty()); - MatchedSymbol top_symbol = std::move(m_parse_stack_matches.top()); - m_parse_stack_matches.pop(); - uint32_t line_num = get_line_num(top_symbol); - Token token = std::get(top_symbol); - string consumed_input = get_input_after_last_newline(m_parse_stack_matches); - string error_type = "unknown error"; - string error_indicator; - Token error_token = token; - string rest_of_line = get_input_until_next_newline(reader, &error_token); - for (uint32_t i = 0; i < consumed_input.size() + 10; i++) { - error_indicator += " "; - } - error_indicator += "^\n"; - if (token.m_type_ids->at(0) == (int) SymbolID::TokenEndID && consumed_input.empty()) { - error_type = "empty file"; - error_indicator = "^\n"; - } else { - error_type = "expected "; - for (uint32_t i = 0; i < m_parse_stack_states.top()->m_actions.size(); i++) { - Action action = m_parse_stack_states.top()->m_actions[i]; - if (action.index() != 0) { - error_type += "'"; - if (auto* regex_ast_literal = dynamic_cast*>(m_lexer.get_rule(i))) { - error_type += unescape(char(regex_ast_literal->get_character())); - } else { - error_type += m_lexer.m_id_symbol[i]; - } - error_type += "',"; - } - } - error_type.pop_back(); - error_type += " before '" + unescape(token.get_string()[0]) + "' token"; - } - string file_name = boost::filesystem::canonical((dynamic_cast(reader)).get_path()).string(); - string error_string = file_name + ":" + std::to_string(line_num + 1) + ":" - + std::to_string(consumed_input.size() + 1) + ": error: " + error_type + "\n"; - for (int i = 0; i < 10; i++) { - error_string += " "; - } - error_string += consumed_input + error_token.get_string() + rest_of_line + error_indicator; - return error_string; - } - - template - NonTerminal 
LALR1Parser::parse (ReaderInterface& reader) { - reset(reader); - m_parse_stack_states.push(root_itemset_ptr); - bool accept = false; - while (true) { - Token next_terminal = get_next_symbol(); - if (parse_advance(next_terminal, &accept)) { - break; - } - } - if (!accept) { - throw std::runtime_error(report_error(reader)); - } - assert(!m_parse_stack_matches.empty()); - MatchedSymbol m = std::move(m_parse_stack_matches.top()); - m_parse_stack_matches.pop(); - assert(m_parse_stack_matches.empty()); - return std::move(std::get(m)); - } - - template - void LALR1Parser::reset (ReaderInterface& reader) { - m_next_token = std::nullopt; - while (!m_parse_stack_states.empty()) { - m_parse_stack_states.pop(); - } - while (!m_parse_stack_matches.empty()) { - m_parse_stack_matches.pop(); - } - m_lexer.reset(reader); - } - - template - Token LALR1Parser::get_next_symbol () { - if (m_next_token == std::nullopt) { - Token token = m_lexer.scan(); - return token; - } - Token s = std::move(m_next_token.value()); - m_next_token = std::nullopt; - return s; - } - - template - bool LALR1Parser::parse_advance (Token& next_token, bool* accept) { - for (int const& type: *(next_token.m_type_ids)) { - if (parse_symbol(type, next_token, accept)) { - return (*accept); - } - } - assert(*accept == false); - // For error handling - m_parse_stack_matches.push(std::move(next_token)); - return true; - } - - template - bool LALR1Parser::parse_symbol (uint32_t const& type_id, Token& next_token, bool* accept) { - ItemSet* curr = m_parse_stack_states.top(); - Action& it = curr->m_actions[type_id]; - bool ret; - std::visit(overloaded{ - [&ret, &accept] (bool is_accepting) { - if (!is_accepting) { - ret = false; - return; - } - *accept = true; - ret = true; - return; - }, - [&ret, &next_token, this] (ItemSet* shift) { - m_parse_stack_states.push(shift); - m_parse_stack_matches.push(std::move(next_token)); - ret = true; - return; - }, - [&ret, &next_token, this] (Production* reduce) { - m_next_token = std::move(next_token); - NonTerminal matched_nonterminal(reduce); - size_t n = reduce->m_body.size(); - for (size_t i = 0; i < n; i++) { - m_parse_stack_states.pop(); - NonTerminal::m_all_children[matched_nonterminal.m_children_start + n - i - 1] = std::move(m_parse_stack_matches.top()); - m_parse_stack_matches.pop(); - } - if (reduce->m_semantic_rule != nullptr) { - m_lexer.set_reduce_pos(m_next_token->m_start_pos - 1); - matched_nonterminal.m_ast = reduce->m_semantic_rule(&matched_nonterminal); - } - ItemSet* curr = m_parse_stack_states.top(); - Action const& it = curr->m_actions[matched_nonterminal.m_production->m_head]; - m_parse_stack_states.push(std::get(it)); - m_parse_stack_matches.push(std::move(matched_nonterminal)); - ret = true; - return; - } - }, it); - return ret; - } -} - -#endif //COMPRESSOR_FRONTEND_LALR1_PARSER_TPP diff --git a/components/core/src/compressor_frontend/Lexer.hpp b/components/core/src/compressor_frontend/Lexer.hpp deleted file mode 100644 index 840fbdc22..000000000 --- a/components/core/src/compressor_frontend/Lexer.hpp +++ /dev/null @@ -1,199 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_LEXER_HPP -#define COMPRESSOR_FRONTEND_LEXER_HPP - -// C++ standard libraries -#include -#include -#include -#include -#include -#include -#include - -// Project headers -#include "../ReaderInterface.hpp" -#include "../Stopwatch.hpp" -#include "Constants.hpp" -#include "finite_automata/RegexAST.hpp" -#include "finite_automata/RegexDFA.hpp" -#include "finite_automata/RegexNFA.hpp" -#include "Token.hpp" - -using 
compressor_frontend::finite_automata::RegexAST; -using compressor_frontend::finite_automata::RegexNFA; -using compressor_frontend::finite_automata::RegexDFA; - -namespace compressor_frontend { - template - class Lexer { - public: - // std::vector can be declared as constexpr in c++20 - inline static const std::vector cTokenEndTypes = {(int) SymbolID::TokenEndID}; - inline static const std::vector cTokenUncaughtStringTypes = {(int) SymbolID::TokenUncaughtStringID}; - - /** - * A lexical rule has a name and regex pattern - */ - struct Rule { - // Constructor - Rule (int n, std::unique_ptr> r) : m_name(n), m_regex(std::move(r)) {} - - /** - * Adds AST representing the lexical rule to the NFA - * @param nfa - */ - void add_ast (RegexNFA* nfa) const; - - int m_name; - std::unique_ptr> m_regex; - }; - - // Constructor - Lexer () : m_byte_buf_pos(0), m_bytes_read(0), m_line(0), m_fail_pos(0), m_reduce_pos(0), m_match(false), m_match_pos(0), m_start_pos(0), - m_match_line(0), m_last_match_pos(0), m_last_match_line(0), m_type_ids(), m_is_delimiter(), m_is_first_char(), m_static_byte_buf(), - m_finished_reading_file(false), m_at_end_of_file(false), m_last_read_first_half_of_buf(false), m_reader(nullptr), m_has_delimiters(false), - m_active_byte_buf(nullptr), m_byte_buf_ptr(nullptr), m_byte_buf_size_ptr(nullptr), m_static_byte_buf_ptr(nullptr) { - for (bool& i: m_is_first_char) { - i = false; - } - } - - /** - * Add a delimiters line from the schema to the lexer - * @param delimiters - */ - void add_delimiters (const std::vector& delimiters); - - /** - * Add lexical rule to the lexer's list of rules - * @param id - * @param regex - */ - void add_rule (const uint32_t& id, std::unique_ptr> regex); - - /** - * Return regex patter for a rule name - * @param name - * @return RegexAST* - */ - RegexAST* get_rule (const uint32_t& name); - - /** - * Generate DFA for lexer - */ - void generate (); - - /** - * Generate DFA for a reverse lexer matching the reverse of the words in the original language - */ - void generate_reverse (); - - /** - * Reset the lexer to start a new lexing (reset buffers, reset vars tracking positions) - * @param reader - */ - void reset (ReaderInterface& reader); - - /** - * After lexing half of the buffer, reads into that half of the buffer and changes variables accordingly - * @param next_children_start - */ - void soft_reset (uint32_t& next_children_start); - - /** - * Gets next token from the input string - * If next token is an uncaught string, the next variable token is already prepped to be returned on the next call - * @return Token - */ - Token scan (); - - /** - * scan(), but with wild wildcards in the input string (for search) - * @param wildcard - * @return Token - */ - Token scan_with_wildcard (char wildcard); - - /** - * Sets the position of where the last reduce was performed, - * Used to know during lexing if half of the buffer has been lexed and needs to be read into - * @param value - */ - void set_reduce_pos (uint32_t value) { - m_reduce_pos = value; - } - - [[nodiscard]] const bool& get_has_delimiters() const { - return m_has_delimiters; - } - - [[nodiscard]] const bool& is_delimiter (uint8_t byte) const { - return m_is_delimiter[byte]; - } - - // First character of any variable in the schema - [[nodiscard]] const bool& is_first_char (uint8_t byte) const { - return m_is_first_char[byte]; - } - - std::map m_symbol_id; - std::map m_id_symbol; - - private: - /** - * Get next character from the input buffer - * @return unsigned char - */ - unsigned char 
get_next_character (); - - /** - * Return epsilon_closure over m_epsilon_transitions - * @return - */ - std::set epsilon_closure (NFAStateType* state_ptr); - - /** - * Generate a DFA from the NFA - * @param RegexNFA nfa - * @return std::unique_ptr> - */ - unique_ptr> nfa_to_dfa (RegexNFA& nfa); - - uint32_t m_fail_pos; - uint32_t m_reduce_pos; - uint32_t m_match_pos; - uint32_t m_start_pos; - uint32_t m_match_line; - uint32_t m_last_match_pos; - uint32_t m_last_match_line; - bool m_match; - const std::vector* m_type_ids; - static uint32_t m_current_buff_size; - bool m_is_delimiter[cSizeOfByte]; - bool m_is_first_char[cSizeOfByte]; - char* m_active_byte_buf; - char** m_byte_buf_ptr; - const uint32_t* m_byte_buf_size_ptr; - char* m_static_byte_buf_ptr; - char m_static_byte_buf[cStaticByteBuffSize]; - bool m_finished_reading_file; - bool m_at_end_of_file; - std::vector m_rules; - uint32_t m_byte_buf_pos; - bool m_last_read_first_half_of_buf; - size_t m_bytes_read; - uint32_t m_line; - ReaderInterface* m_reader; - bool m_has_delimiters; - unique_ptr> m_dfa; - }; - - namespace lexers { - using ByteLexer = Lexer; - using UTF8Lexer = Lexer; - }; -} - -#include "Lexer.inc" - -#endif // COMPRESSOR_FRONTEND_LEXER_HPP diff --git a/components/core/src/compressor_frontend/Lexer.inc b/components/core/src/compressor_frontend/Lexer.inc deleted file mode 100644 index 41b6ee7e9..000000000 --- a/components/core/src/compressor_frontend/Lexer.inc +++ /dev/null @@ -1,541 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_LEXER_TPP -#define COMPRESSOR_FRONTEND_LEXER_TPP - -#include "Lexer.hpp" - -// C++ standard libraries -#include -#include -#include - -// Project headers -#include "../FileReader.hpp" -#include "../spdlog_with_specializations.hpp" -#include "Constants.hpp" -#include "finite_automata/RegexAST.hpp" - -using std::string; -using std::to_string; - -/** - * utf8 format (https://en.wikipedia.org/wiki/UTF-8) - * 1 byte: 0x0 - 0x80 : 0xxxxxxx - * 2 byte: 0x80 - 0x7FF : 110xxxxx 10xxxxxx - * 3 byte: 0x800 - 0xFFFF : 1110xxxx 10xxxxxx 10xxxxxx - * 4 byte: 0x10000 - 0x1FFFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - */ -namespace compressor_frontend { - template - uint32_t Lexer::m_current_buff_size; - - template - void Lexer::soft_reset (uint32_t& next_children_start) { - if (next_children_start > cSizeOfAllChildren / 2) { - next_children_start = 0; - } - if (m_finished_reading_file) { - return; - } - if (m_reduce_pos == -1) { - m_reduce_pos += m_current_buff_size; - } - if ((!m_last_read_first_half_of_buf && m_reduce_pos > m_current_buff_size / 2) || - (m_last_read_first_half_of_buf && m_reduce_pos < m_current_buff_size / 2 && m_reduce_pos > 0)) { - uint32_t offset = 0; - if (m_last_read_first_half_of_buf) { - offset = m_current_buff_size / 2; - } - m_reader->read(m_active_byte_buf + offset, m_current_buff_size / 2, m_bytes_read); - - if (m_bytes_read < m_current_buff_size / 2) { - m_finished_reading_file = true; - } - m_last_read_first_half_of_buf = !m_last_read_first_half_of_buf; - m_bytes_read += offset; - if (m_reduce_pos >= m_current_buff_size / 2) { - m_fail_pos = m_current_buff_size / 2; - } else { - m_fail_pos = 0; - } - } - } - - template - unsigned char Lexer::get_next_character () { - if (m_finished_reading_file && m_byte_buf_pos == m_bytes_read) { - m_at_end_of_file = true; - return utf8::cCharEOF; - } - unsigned char character = m_active_byte_buf[m_byte_buf_pos]; - m_byte_buf_pos++; - if (m_byte_buf_pos == m_current_buff_size) { - m_byte_buf_pos = 0; - } - return character; - } - - template - Token 
Lexer::scan () { - if (m_match) { - m_match = false; - m_last_match_pos = m_match_pos; - m_last_match_line = m_match_line; - return Token{m_start_pos, m_match_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_match_line, m_type_ids}; - } - m_start_pos = m_byte_buf_pos; - m_match_pos = m_byte_buf_pos; - m_match_line = m_line; - m_type_ids = nullptr; - DFAStateType* state = m_dfa->get_root(); - while (true) { - if (m_byte_buf_pos == m_fail_pos) { - string warn = "Long line detected"; - warn += " at line " + to_string(m_line); - warn += " in file " + dynamic_cast(m_reader)->get_path(); - warn += " changing to dynamic buffer and increasing buffer size to "; - warn += to_string(m_current_buff_size * 2); - SPDLOG_WARN(warn); - // Found a super long line: for completeness handle this case, but efficiency doesn't matter - // 1. copy everything from old buffer into new buffer - if (m_active_byte_buf == m_static_byte_buf) { - m_active_byte_buf = (char*) malloc(m_current_buff_size * sizeof(char)); - if (m_fail_pos == 0) { - memcpy(m_active_byte_buf, m_static_byte_buf, sizeof(m_static_byte_buf)); - } else { - /// TODO: make a test case for this scenario - memcpy(m_active_byte_buf, m_static_byte_buf + sizeof(m_static_byte_buf) / 2, sizeof(m_static_byte_buf) / 2); - memcpy(m_active_byte_buf + sizeof(m_static_byte_buf) / 2, m_static_byte_buf, sizeof(m_static_byte_buf) / 2); - if (m_match_pos >= m_current_buff_size / 2) { - m_match_pos -= m_current_buff_size / 2; - } else { - m_match_pos += m_current_buff_size / 2; - } - if (m_start_pos >= m_current_buff_size / 2) { - m_start_pos -= m_current_buff_size / 2; - } else { - m_start_pos += m_current_buff_size / 2; - } - if (m_last_match_pos >= m_current_buff_size / 2) { - m_last_match_pos -= m_current_buff_size / 2; - } else { - m_last_match_pos += m_current_buff_size / 2; - } - } - } - m_current_buff_size *= 2; - m_active_byte_buf = (char*) realloc(m_active_byte_buf, m_current_buff_size * sizeof(char)); - m_byte_buf_ptr = &m_active_byte_buf; - m_byte_buf_size_ptr = &m_current_buff_size; - if (m_active_byte_buf == nullptr) { - SPDLOG_ERROR("failed to allocate byte buffer of size {}", m_current_buff_size); - string err = "Lexer failed to find a match after checking entire buffer"; - err += " at line " + to_string(m_line); - err += " in file " + dynamic_cast(m_reader)->get_path(); - dynamic_cast(m_reader)->close(); - throw (err); // this throw allows for continuation of compressing other files - } - m_reader->read(m_active_byte_buf + m_current_buff_size / 2, m_current_buff_size / 2, m_bytes_read); - m_bytes_read += m_current_buff_size / 2; - if (m_bytes_read < m_current_buff_size) { - m_finished_reading_file = true; - } - m_byte_buf_pos = m_current_buff_size / 2; - m_fail_pos = 0; - } - uint32_t prev_byte_buf_pos = m_byte_buf_pos; - unsigned char next_char = get_next_character(); - if ((m_is_delimiter[next_char] || m_at_end_of_file || !m_has_delimiters) && state->is_accepting()) { - m_match = true; - m_type_ids = &(state->get_tags()); - m_match_pos = prev_byte_buf_pos; - m_match_line = m_line; - } - DFAStateType* next = state->next(next_char); - if (next_char == '\n') { - m_line++; - if (m_has_delimiters && !m_match) { - next = m_dfa->get_root()->next(next_char); - m_match = true; - m_type_ids = &(next->get_tags()); - m_start_pos = prev_byte_buf_pos; - m_match_pos = m_byte_buf_pos; - m_match_line = m_line; - } - } - if (m_at_end_of_file || next == nullptr) { - if (m_match) { - m_at_end_of_file = false; - m_byte_buf_pos = m_match_pos; - m_line = m_match_line; - if 
(m_last_match_pos != m_start_pos) { - return Token{m_last_match_pos, m_start_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_last_match_line, &cTokenUncaughtStringTypes}; - } - m_match = false; - m_last_match_pos = m_match_pos; - m_last_match_line = m_match_line; - return Token{m_start_pos, m_match_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_match_line, m_type_ids}; - } else if (m_at_end_of_file && m_start_pos == m_byte_buf_pos) { - if (m_last_match_pos != m_start_pos) { - m_match_pos = m_byte_buf_pos; - m_type_ids = &cTokenEndTypes; - m_match = true; - return Token{m_last_match_pos, m_start_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_last_match_line, &cTokenUncaughtStringTypes}; - } - return Token{m_byte_buf_pos, m_byte_buf_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_line, &cTokenEndTypes}; - } else { - while (!m_at_end_of_file && !m_is_first_char[next_char]) { - prev_byte_buf_pos = m_byte_buf_pos; - next_char = get_next_character(); - } - m_byte_buf_pos = prev_byte_buf_pos; - m_start_pos = prev_byte_buf_pos; - state = m_dfa->get_root(); - continue; - } - } - state = next; - } - } - - /// TODO: this is duplicating almost all the code of scan() - template - Token Lexer::scan_with_wildcard (char wildcard) { - if (m_match) { - m_match = false; - m_last_match_pos = m_match_pos; - m_last_match_line = m_match_line; - return Token{m_start_pos, m_match_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_match_line, m_type_ids}; - } - m_start_pos = m_byte_buf_pos; - m_match_pos = m_byte_buf_pos; - m_match_line = m_line; - m_type_ids = nullptr; - DFAStateType* state = m_dfa->get_root(); - while (true) { - if (m_byte_buf_pos == m_fail_pos) { - string warn = "Long line detected"; - warn += " at line " + to_string(m_line); - warn += " in file " + dynamic_cast(m_reader)->get_path(); - warn += " changing to dynamic buffer and increasing buffer size to "; - warn += to_string(m_current_buff_size * 2); - SPDLOG_WARN(warn); - // Found a super long line: for completeness handle this case, but efficiency doesn't matter - // 1. 
copy everything from old buffer into new buffer - if (m_active_byte_buf == m_static_byte_buf) { - m_active_byte_buf = (char*) malloc(m_current_buff_size * sizeof(char)); - if (m_fail_pos == 0) { - memcpy(m_active_byte_buf, m_static_byte_buf, sizeof(m_static_byte_buf)); - } else { - /// TODO: make a test case for this scenario - memcpy(m_active_byte_buf, m_static_byte_buf + sizeof(m_static_byte_buf) / 2, sizeof(m_static_byte_buf) / 2); - memcpy(m_active_byte_buf + sizeof(m_static_byte_buf) / 2, m_static_byte_buf, sizeof(m_static_byte_buf) / 2); - if (m_match_pos >= m_current_buff_size / 2) { - m_match_pos -= m_current_buff_size / 2; - } else { - m_match_pos += m_current_buff_size / 2; - } - if (m_start_pos >= m_current_buff_size / 2) { - m_start_pos -= m_current_buff_size / 2; - } else { - m_start_pos += m_current_buff_size / 2; - } - if (m_last_match_pos >= m_current_buff_size / 2) { - m_last_match_pos -= m_current_buff_size / 2; - } else { - m_last_match_pos += m_current_buff_size / 2; - } - } - } - m_current_buff_size *= 2; - m_active_byte_buf = (char*) realloc(m_active_byte_buf, m_current_buff_size * sizeof(char)); - m_byte_buf_ptr = &m_active_byte_buf; - m_byte_buf_size_ptr = &m_current_buff_size; - if (m_active_byte_buf == nullptr) { - SPDLOG_ERROR("failed to allocate byte buffer of size {}", m_current_buff_size); - string err = "Lexer failed to find a match after checking entire buffer"; - err += " at line " + to_string(m_line); - err += " in file " + dynamic_cast(m_reader)->get_path(); - dynamic_cast(m_reader)->close(); - throw (err); // this throw allows for continuation of compressing other files - } - m_reader->read(m_active_byte_buf + m_current_buff_size / 2, m_current_buff_size / 2, m_bytes_read); - m_bytes_read += m_current_buff_size / 2; - if (m_bytes_read < m_current_buff_size) { - m_finished_reading_file = true; - } - m_byte_buf_pos = m_current_buff_size / 2; - m_fail_pos = 0; - } - uint32_t prev_byte_buf_pos = m_byte_buf_pos; - unsigned char next_char = get_next_character(); - if ((m_is_delimiter[next_char] || m_at_end_of_file || !m_has_delimiters) && state->is_accepting()) { - m_match = true; - m_type_ids = &(state->get_tags()); - m_match_pos = prev_byte_buf_pos; - m_match_line = m_line; - } - DFAStateType* next = state->next(next_char); - if (next_char == '\n') { - m_line++; - if (m_has_delimiters && !m_match) { - next = m_dfa->get_root()->next(next_char); - m_match = true; - m_type_ids = &(next->get_tags()); - m_start_pos = prev_byte_buf_pos; - m_match_pos = m_byte_buf_pos; - m_match_line = m_line; - } - } - - // !m_at_end_of_file should be impossible - // m_match_pos != m_byte_buf_pos --> "te matches from "tes*" (means "tes" isn't a match, so is_var = false) - // - if (m_at_end_of_file || next == nullptr) { - assert(m_at_end_of_file); - - if (!m_match || (m_match && m_match_pos != m_byte_buf_pos)) { - return Token{m_last_match_pos, m_byte_buf_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_last_match_line, &cTokenUncaughtStringTypes}; - } - if (m_match) { - // BFS (keep track of m_type_ids) - if (wildcard == '?') { - for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - DFAStateType* next_state = state->next(byte); - if (next_state->is_accepting() == false) { - return Token{m_last_match_pos, m_byte_buf_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_last_match_line, &cTokenUncaughtStringTypes}; - } - } - } else if (wildcard == '*') { - std::stack unvisited_states; - std::set visited_states; - unvisited_states.push(state); - while (!unvisited_states.empty()) { - DFAStateType* 
current_state = unvisited_states.top(); - if (current_state == nullptr || current_state->is_accepting() == false) { - return Token{m_last_match_pos, m_byte_buf_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_last_match_line, &cTokenUncaughtStringTypes}; - } - unvisited_states.pop(); - visited_states.insert(current_state); - for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - if (m_is_delimiter[byte]) { - continue; - } - DFAStateType* next_state = current_state->next(byte); - if (visited_states.find(next_state) == visited_states.end()) { - unvisited_states.push(next_state); - } - } - } - } - m_byte_buf_pos = m_match_pos; - m_line = m_match_line; - m_match = false; - m_last_match_pos = m_match_pos; - m_last_match_line = m_match_line; - return Token{m_start_pos, m_match_pos, m_byte_buf_ptr, m_byte_buf_size_ptr, m_match_line, m_type_ids}; - } - } - state = next; - } - } - - // If reset() is called all Tokens previously created by the lexer are invalid - template - void Lexer::reset (ReaderInterface& reader_interface) { - m_reader = &reader_interface; - m_finished_reading_file = false; - m_at_end_of_file = false; - m_reduce_pos = 0; - m_last_match_pos = 0; - m_match = false; - m_byte_buf_pos = 0; - m_line = 0; - m_bytes_read = 0; - m_last_read_first_half_of_buf = true; - if (m_active_byte_buf != nullptr && m_active_byte_buf != m_static_byte_buf) { - free(m_active_byte_buf); - } - m_static_byte_buf_ptr = m_static_byte_buf; - m_active_byte_buf = m_static_byte_buf; - m_current_buff_size = cStaticByteBuffSize; - m_byte_buf_ptr = &m_static_byte_buf_ptr; - m_byte_buf_size_ptr = &cStaticByteBuffSize; - - m_reader->read(m_active_byte_buf, m_current_buff_size / 2, m_bytes_read); - if (m_bytes_read < m_current_buff_size / 2) { - m_finished_reading_file = true; - } - m_fail_pos = m_current_buff_size / 2; - m_match_pos = 0; - m_start_pos = 0; - m_match_line = 0; - m_last_match_line = 0; - m_type_ids = nullptr; - } - - template - void Lexer::add_delimiters (const std::vector& delimiters) { - assert(!delimiters.empty()); - m_has_delimiters = true; - for (bool& i: m_is_delimiter) { - i = false; - } - for (uint32_t delimiter: delimiters) { - m_is_delimiter[delimiter] = true; - } - } - - template - void Lexer::add_rule (const uint32_t& id, std::unique_ptr> rule) { - m_rules.emplace_back(id, std::move(rule)); - } - - template - RegexAST* Lexer::get_rule (const uint32_t& name) { - for (Rule& rule: m_rules) { - if (rule.m_name == name) { - return rule.m_regex.get(); - } - } - return nullptr; - } - - template - void Lexer::generate () { - RegexNFA nfa; - for (const Rule& r: m_rules) { - r.add_ast(&nfa); - } - m_dfa = nfa_to_dfa(nfa); - - DFAStateType* state = m_dfa->get_root(); - for (uint32_t i = 0; i < cSizeOfByte; i++) { - if (state->next(i) != nullptr) { - m_is_first_char[i] = true; - } else { - m_is_first_char[i] = false; - } - } - } - - template - void Lexer::generate_reverse () { - RegexNFA nfa; - for (const Rule& r: m_rules) { - r.add_ast(&nfa); - } - - nfa.reverse(); - - m_dfa = nfa_to_dfa(nfa); - - DFAStateType* state = m_dfa->get_root(); - for (uint32_t i = 0; i < cSizeOfByte; i++) { - if (state->next(i) != nullptr) { - m_is_first_char[i] = true; - } else { - m_is_first_char[i] = false; - } - } - } - - template - void Lexer::Rule::add_ast (RegexNFA* nfa) const { - NFAStateType* s = nfa->new_state(); - s->set_accepting(true); - s->set_tag(m_name); - m_regex->add(nfa, s); - } - - template - std::set Lexer::epsilon_closure (NFAStateType* state_ptr) { - std::set closure_set; - std::stack stack; - 
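- // Iterative depth-first traversal: each state enters closure_set exactly once, and its epsilon-successors are pushed for processing; e.g., with s0 -e-> s1 -e-> s2, epsilon_closure(s0) = {s0, s1, s2}.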
stack.push(state_ptr); - while (!stack.empty()) { - NFAStateType* t = stack.top(); - stack.pop(); - if (closure_set.insert(t).second) { - for (NFAStateType* const u: t->get_epsilon_transitions()) { - stack.push(u); - } - } - } - return closure_set; - } - - template - unique_ptr> Lexer::nfa_to_dfa (RegexNFA& nfa) { - - typedef std::set StateSet; - unique_ptr> dfa(new RegexDFA); - - map dfa_states; - stack unmarked_sets; - - auto create_dfa_state = - [&dfa, &dfa_states, &unmarked_sets] (const StateSet& set) -> DFAStateType* { - DFAStateType* state = dfa->new_state(set); - dfa_states[set] = state; - unmarked_sets.push(set); - return state; - }; - - StateSet start_set = epsilon_closure(nfa.m_root); - create_dfa_state(start_set); - - while (!unmarked_sets.empty()) { - StateSet set = unmarked_sets.top(); - unmarked_sets.pop(); - DFAStateType* dfa_state = dfa_states.at(set); - - map ascii_transitions_map; - // map transitions_map; - - for (NFAStateType* s0: set) { - for (uint32_t i = 0; i < cSizeOfByte; i++) { - for (NFAStateType* const s1: s0->get_byte_transitions(i)) { - StateSet closure = epsilon_closure(s1); - ascii_transitions_map[i].insert(closure.begin(), closure.end()); - } - } - - /// TODO: add this for the utf8 case - //for (const typename NFAStateType::Tree::Data& data: s0->get_tree_transitions().all()) { - // for (NFAStateType* const s1: data.m_value) { - // StateSet closure = epsilon_closure(s1); - // transitions_map[data.m_interval].insert(closure.begin(), closure.end()); - // } - //} - - } - - auto next_dfa_state = - [&dfa_states, &create_dfa_state] (const StateSet& set) -> DFAStateType* { - DFAStateType* state; - auto it = dfa_states.find(set); - if (it == dfa_states.end()) { - state = create_dfa_state(set); - } else { - state = it->second; - } - return state; - }; - - for (const typename map::value_type& kv: ascii_transitions_map) { - DFAStateType* dest_state = next_dfa_state(kv.second); - dfa_state->add_byte_transition(kv.first, dest_state); - } - - /// TODO: add this for the utf8 case - //for (const typename map::value_type& kv: transitions_map) { - // DFAStateType* dest_state = next_dfa_state(kv.second); - // dfa_state->add_tree_transition(kv.first, dest_state); - //} - - } - return dfa; - } -} - -#endif // COMPRESSOR_FRONTEND_LEXER_TPP diff --git a/components/core/src/compressor_frontend/LogParser.cpp b/components/core/src/compressor_frontend/LogParser.cpp deleted file mode 100644 index e5ac766dd..000000000 --- a/components/core/src/compressor_frontend/LogParser.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "LogParser.hpp" - -// C++ standard libraries -#include -#include - -// Project headers -#include "../clp/utils.hpp" -#include "../spdlog_with_specializations.hpp" -#include "Constants.hpp" -#include "SchemaParser.hpp" - -using compressor_frontend::finite_automata::RegexAST; -using compressor_frontend::finite_automata::RegexASTCat; -using compressor_frontend::finite_automata::RegexASTGroup; -using compressor_frontend::finite_automata::RegexASTInteger; -using compressor_frontend::finite_automata::RegexASTLiteral; -using compressor_frontend::finite_automata::RegexASTMultiplication; -using compressor_frontend::finite_automata::RegexASTOr; -using std::make_unique; -using std::runtime_error; -using std::string; -using std::to_string; -using std::unique_ptr; -using std::vector; - -namespace compressor_frontend { - LogParser::LogParser (const string& schema_file_path) { - m_active_uncompressed_msg = nullptr; - m_uncompressed_msg_size = 0; - - std::unique_ptr schema_ast = 
compressor_frontend::SchemaParser::try_schema_file(schema_file_path); - add_delimiters(schema_ast->m_delimiters); - add_rules(schema_ast); - m_lexer.generate(); - } - - void LogParser::add_delimiters (const unique_ptr& delimiters) { - auto delimiters_ptr = dynamic_cast(delimiters.get()); - if (delimiters_ptr != nullptr) { - m_lexer.add_delimiters(delimiters_ptr->m_delimiters); - } - } - - void LogParser::add_rules (const unique_ptr& schema_ast) { - // Currently, required to have delimiters (if schema_ast->delimiters != nullptr it is already enforced that at least 1 delimiter is specified) - if (schema_ast->m_delimiters == nullptr) { - throw runtime_error("When using --schema-path, \"delimiters:\" line must be used."); - } - vector& delimiters = dynamic_cast(schema_ast->m_delimiters.get())->m_delimiters; - add_token("newLine", '\n'); - for (unique_ptr const& parser_ast: schema_ast->m_schema_vars) { - auto rule = dynamic_cast(parser_ast.get()); - - // transform '.' from any-character into any non-delimiter character - rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); - - if (rule->m_name == "timestamp") { - unique_ptr> first_timestamp_regex_ast(rule->m_regex_ptr->clone()); - add_rule("firstTimestamp", std::move(first_timestamp_regex_ast)); - unique_ptr> newline_timestamp_regex_ast(rule->m_regex_ptr->clone()); - unique_ptr> r2 = make_unique>('\n'); - add_rule("newLineTimestamp", make_unique>(std::move(r2), std::move(newline_timestamp_regex_ast))); - // prevent timestamps from going into the dictionary - continue; - } - // currently, error out if non-timestamp pattern contains a delimiter - // check if regex contains a delimiter - bool is_possible_input[cUnicodeMax] = {false}; - rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input); - bool contains_delimiter = false; - uint32_t delimiter_name; - for (uint32_t delimiter: delimiters) { - if (is_possible_input[delimiter]) { - contains_delimiter = true; - delimiter_name = delimiter; - break; - } - } - if (contains_delimiter) { - FileReader schema_reader; - ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path); - if (ErrorCode_Success != error_code) { - throw std::runtime_error(schema_ast->m_file_path + ":" + to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name - + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n"); - } else { - // more detailed debugging based on looking at the file - string line; - for (uint32_t i = 0; i <= rule->m_line_num; i++) { - schema_reader.read_to_delimiter('\n', false, false, line); - } - int colon_pos = 0; - for (char i : line) { - colon_pos++; - if (i == ':') { - break; - } - } - string indent(10, ' '); - string spaces(colon_pos, ' '); - string arrows(line.size() - colon_pos, '^'); - - throw std::runtime_error(schema_ast->m_file_path + ":" + to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name - + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n" - + indent + line + "\n" + indent + spaces + arrows + "\n"); - } - } - unique_ptr> delimiter_group = - make_unique>(RegexASTGroup(delimiters)); - rule->m_regex_ptr = make_unique>(std::move(delimiter_group), std::move(rule->m_regex_ptr)); - add_rule(rule->m_name, std::move(rule->m_regex_ptr)); - } - } - - - void LogParser::increment_uncompressed_msg_pos (ReaderInterface& reader) { - m_uncompressed_msg_pos++; - if (m_uncompressed_msg_pos == m_uncompressed_msg_size) { - string warn = "Very long line detected"; - warn += " changing to dynamic uncompressed_msg 
and increasing size to "; - warn += to_string(m_uncompressed_msg_size * 2); - SPDLOG_WARN("warn"); - if (m_active_uncompressed_msg == m_static_uncompressed_msg) { - m_active_uncompressed_msg = (Token*) malloc(m_uncompressed_msg_size * sizeof(Token)); - memcpy(m_active_uncompressed_msg, m_static_uncompressed_msg, sizeof(m_static_uncompressed_msg)); - } - m_uncompressed_msg_size *= 2; - m_active_uncompressed_msg = (Token*) realloc(m_active_uncompressed_msg, m_uncompressed_msg_size * sizeof(Token)); - if (m_active_uncompressed_msg == nullptr) { - SPDLOG_ERROR("failed to allocate uncompressed msg of size {}", m_uncompressed_msg_size); - string err = "Lexer failed to find a match after checking entire buffer"; - err += " in file " + dynamic_cast(reader).get_path(); - clp::close_file_and_append_to_segment(*m_archive_writer_ptr); - dynamic_cast(reader).close(); - throw (err); // error of this type will allow the program to continue running to compress other files - } - } - } - - void LogParser::parse (ReaderInterface& reader) { - m_uncompressed_msg_pos = 0; - if (m_active_uncompressed_msg != m_static_uncompressed_msg) { - free(m_active_uncompressed_msg); - } - m_uncompressed_msg_size = cStaticByteBuffSize; - m_active_uncompressed_msg = m_static_uncompressed_msg; - reset(reader); - m_parse_stack_states.push(root_itemset_ptr); - m_active_uncompressed_msg[0] = get_next_symbol(); - bool has_timestamp = false; - if (m_active_uncompressed_msg[0].m_type_ids->at(0) == (int) SymbolID::TokenEndID) { - return; - } - if (m_active_uncompressed_msg[0].m_type_ids->at(0) == (int) SymbolID::TokenFirstTimestampId) { - has_timestamp = true; - increment_uncompressed_msg_pos(reader); - } else { - has_timestamp = false; - m_archive_writer_ptr->change_ts_pattern(nullptr); - m_active_uncompressed_msg[1] = m_active_uncompressed_msg[0]; - m_uncompressed_msg_pos = 2; - } - while (true) { - m_active_uncompressed_msg[m_uncompressed_msg_pos] = get_next_symbol(); - int token_type = m_active_uncompressed_msg[m_uncompressed_msg_pos].m_type_ids->at(0); - if (token_type == (int) SymbolID::TokenEndID) { - m_archive_writer_ptr->write_msg_using_schema(m_active_uncompressed_msg, m_uncompressed_msg_pos, - m_lexer.get_has_delimiters(), has_timestamp); - break; - } - bool found_start_of_next_message = (has_timestamp && token_type == (int) SymbolID::TokenNewlineTimestampId) || - (!has_timestamp && m_active_uncompressed_msg[m_uncompressed_msg_pos].get_char(0) == '\n' && - token_type != (int) SymbolID::TokenNewlineId); - bool found_end_of_current_message = !has_timestamp && token_type == (int) SymbolID::TokenNewlineId; - if (found_end_of_current_message) { - m_lexer.set_reduce_pos(m_active_uncompressed_msg[m_uncompressed_msg_pos].m_end_pos); - increment_uncompressed_msg_pos(reader); - m_archive_writer_ptr->write_msg_using_schema(m_active_uncompressed_msg, m_uncompressed_msg_pos, - m_lexer.get_has_delimiters(), has_timestamp); - m_uncompressed_msg_pos = 0; - m_lexer.soft_reset(NonTerminal::m_next_children_start); - } - if (found_start_of_next_message) { - increment_uncompressed_msg_pos(reader); - m_active_uncompressed_msg[m_uncompressed_msg_pos] = m_active_uncompressed_msg[m_uncompressed_msg_pos - 1]; - if (m_active_uncompressed_msg[m_uncompressed_msg_pos].m_start_pos == *m_active_uncompressed_msg[m_uncompressed_msg_pos].m_buffer_size_ptr - 1) { - m_active_uncompressed_msg[m_uncompressed_msg_pos].m_start_pos = 0; - } else { - m_active_uncompressed_msg[m_uncompressed_msg_pos].m_start_pos++; - } - 
m_active_uncompressed_msg[m_uncompressed_msg_pos - 1].m_end_pos = - m_active_uncompressed_msg[m_uncompressed_msg_pos - 1].m_start_pos + 1; - m_active_uncompressed_msg[m_uncompressed_msg_pos - 1].m_type_ids = &Lexer::cTokenUncaughtStringTypes; - m_lexer.set_reduce_pos(m_active_uncompressed_msg[m_uncompressed_msg_pos].m_start_pos - 1); - m_archive_writer_ptr->write_msg_using_schema(m_active_uncompressed_msg, m_uncompressed_msg_pos, - m_lexer.get_has_delimiters(), has_timestamp); - // switch to timestamped messages if a timestamp is ever found at the start of line (potentially dangerous as it never switches back) - /// TODO: potentially switch back if a new line is reached and the message is too long (100x static message size) - if (token_type == (int) SymbolID::TokenNewlineTimestampId) { - has_timestamp = true; - } - if (has_timestamp) { - m_active_uncompressed_msg[0] = m_active_uncompressed_msg[m_uncompressed_msg_pos]; - m_uncompressed_msg_pos = 0; - } else { - m_active_uncompressed_msg[1] = m_active_uncompressed_msg[m_uncompressed_msg_pos]; - m_uncompressed_msg_pos = 1; - } - m_lexer.soft_reset(NonTerminal::m_next_children_start); - } - increment_uncompressed_msg_pos(reader); - } - } - - Token LogParser::get_next_symbol () { - return m_lexer.scan(); - } -} diff --git a/components/core/src/compressor_frontend/LogParser.hpp b/components/core/src/compressor_frontend/LogParser.hpp deleted file mode 100644 index f6c93e4b8..000000000 --- a/components/core/src/compressor_frontend/LogParser.hpp +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_LOGPARSER_HPP -#define COMPRESSOR_FRONTEND_LOGPARSER_HPP - -// C++ standard libraries -#include -#include - -// Boost libraries -#include - -// Project headers -#include "../Stopwatch.hpp" -#include "LALR1Parser.hpp" -#include "SchemaParser.hpp" - -namespace compressor_frontend { - - using finite_automata::RegexDFAByteState; - using finite_automata::RegexNFAByteState; - - /// TODO: try not inheriting from LALR1Parser (and compare c-array vs. 
vectors (its underlying array) for buffers afterwards) - class LogParser : public LALR1Parser { - public: - // Constructor - LogParser (const std::string& schema_file_path); - - /** - * /// TODO: this description will need to change after adding it directly into the dictionary writer - * Custom parsing for the log that builds up an uncompressed message and then compresses it all at once - * @param reader - */ - void parse (ReaderInterface& reader); - - /** - * Increment uncompressed message pos, considering swapping to a dynamic buffer (or doubling its size) when the current buffer size is reached - * @param reader - */ - void increment_uncompressed_msg_pos (ReaderInterface& reader); - - private: - /** - * Request the next symbol from the lexer - * @return Token - */ - Token get_next_symbol (); - - /** - * Add delimiters (originally from the schema AST from the user defined schema) to the log parser - * @param delimiters - */ - void add_delimiters (const std::unique_ptr& delimiters); - - /** - * Add log lexing rules (directly from the schema AST from the user defined schema) to the log lexer - * Add delimiters to the start of regex formats if delimiters are specified in user defined schema - * Timestamps aren't matched mid log message as a variable (as they can contain delimiters, which will break search) - * Variables other than timestamps cannot have delimiters - * @param schema_ast - */ - void add_rules (const std::unique_ptr& schema_ast); - - Token* m_active_uncompressed_msg; - uint32_t m_uncompressed_msg_size; - Token m_static_uncompressed_msg[cStaticByteBuffSize]; - uint32_t m_uncompressed_msg_pos = 0; - - }; -} - -#endif // COMPRESSOR_FRONTEND_LOGPARSER_HPP diff --git a/components/core/src/compressor_frontend/SchemaParser.cpp b/components/core/src/compressor_frontend/SchemaParser.cpp deleted file mode 100644 index 419ddee4e..000000000 --- a/components/core/src/compressor_frontend/SchemaParser.cpp +++ /dev/null @@ -1,463 +0,0 @@ -#include "SchemaParser.hpp" - -// C++ libraries -#include -#include - -// Project headers -#include "../FileReader.hpp" -#include "../spdlog_with_specializations.hpp" -#include "Constants.hpp" -#include "finite_automata/RegexAST.hpp" -#include "LALR1Parser.hpp" -#include "Lexer.hpp" - -using RegexASTByte = compressor_frontend::finite_automata::RegexAST; -using RegexASTGroupByte = compressor_frontend::finite_automata::RegexASTGroup; -using RegexASTIntegerByte = compressor_frontend::finite_automata::RegexASTInteger; -using RegexASTLiteralByte = compressor_frontend::finite_automata::RegexASTLiteral; -using RegexASTMultiplicationByte = compressor_frontend::finite_automata::RegexASTMultiplication; -using RegexASTOrByte = compressor_frontend::finite_automata::RegexASTOr; -using RegexASTCatByte = compressor_frontend::finite_automata::RegexASTCat; - - -using std::make_unique; -using std::string; -using std::unique_ptr; - -namespace compressor_frontend { - SchemaParser::SchemaParser () { - add_lexical_rules(); - add_productions(); - generate(); - } - - unique_ptr SchemaParser::generate_schema_ast (ReaderInterface& reader) { - NonTerminal nonterminal = parse(reader); - std::unique_ptr schema_file_ast(dynamic_cast(nonterminal.getParserAST().release())); - return std::move(schema_file_ast); - } - - unique_ptr SchemaParser::try_schema_file (const string& schema_file_path) { - FileReader schema_reader; - ErrorCode error_code = schema_reader.try_open(schema_file_path); - if (ErrorCode_Success != error_code) { - if (ErrorCode_FileNotFound == error_code) { - 
SPDLOG_ERROR("'{}' does not exist.", schema_file_path); - } else if (ErrorCode_errno == error_code) { - SPDLOG_ERROR("Failed to read '{}', errno={}", schema_file_path, errno); - } else { - SPDLOG_ERROR("Failed to read '{}', error_code={}", schema_file_path, error_code); - } - return nullptr; - } - SchemaParser sp; - unique_ptr schema_ast = sp.generate_schema_ast(schema_reader); - schema_reader.close(); - schema_ast->m_file_path = std::filesystem::canonical(schema_reader.get_path()).string(); - return schema_ast; - } - - static unique_ptr new_identifier_rule (NonTerminal* m) { - string r1 = m->token_cast(0)->get_string(); - return make_unique(IdentifierAST(r1[0])); - } - - static unique_ptr existing_identifier_rule (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - auto* r1_ptr = dynamic_cast(r1.get()); - string r2 = m->token_cast(1)->get_string(); - r1_ptr->add_character(r2[0]); - return std::move(r1); - } - - static unique_ptr schema_var_rule (NonTerminal* m) { - auto* r2 = dynamic_cast(m->nonterminal_cast(1)->getParserAST().get()); - Token* colon_token = m->token_cast(2); - auto& r4 = m->nonterminal_cast(3)->getParserAST()->get>(); - return make_unique(r2->m_name, std::move(r4), colon_token->m_line); - } - - static unique_ptr new_schema_file_rule (NonTerminal* m) { - return make_unique(); - } - - static unique_ptr new_schema_file_rule_with_var (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - unique_ptr schema_file_ast = make_unique(); - schema_file_ast->add_schema_var(std::move(r1)); - return std::move(schema_file_ast); - } - - - static unique_ptr new_schema_file_rule_with_delimiters (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(2)->getParserAST(); - unique_ptr schema_file_ast = make_unique(); - schema_file_ast->set_delimiters(std::move(r1)); - return std::move(schema_file_ast); - } - - static unique_ptr existing_schema_file_rule_with_delimiter (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - std::unique_ptr schema_file_ast(dynamic_cast(r1.release())); - unique_ptr& r5 = m->nonterminal_cast(4)->getParserAST(); - schema_file_ast->set_delimiters(std::move(r5)); - return std::move(schema_file_ast); - } - - unique_ptr SchemaParser::existing_schema_file_rule (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - std::unique_ptr schema_file_ast(dynamic_cast(r1.release())); - unique_ptr& r2 = m->nonterminal_cast(2)->getParserAST(); - schema_file_ast->add_schema_var(std::move(r2)); - m_lexer.soft_reset(NonTerminal::m_next_children_start); - return std::move(schema_file_ast); - } - - static unique_ptr identity_rule_ParserASTSchemaFile (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - std::unique_ptr schema_file_ast(dynamic_cast(r1.release())); - return std::move(schema_file_ast); - } - - typedef ParserValue> ParserValueRegex; - - static unique_ptr regex_identity_rule (NonTerminal* m) { - return unique_ptr( - new ParserValueRegex(std::move(m->nonterminal_cast(0)->getParserAST()->get>()))); - } - - static unique_ptr regex_cat_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTCatByte(std::move(r1), std::move(r2))))); - } - - static unique_ptr regex_or_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto& r2 = 
m->nonterminal_cast(2)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTOrByte(std::move(r1), std::move(r2))))); - } - - static unique_ptr regex_match_zero_or_more_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 0, 0)))); - } - - static unique_ptr regex_match_one_or_more_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 1, 0)))); - } - - static unique_ptr regex_match_exactly_rule (NonTerminal* m) { - auto& r3 = m->nonterminal_cast(2)->getParserAST()->get>(); - auto* r3_ptr = dynamic_cast(r3.get()); - uint32_t reps = 0; - uint32_t r3_size = r3_ptr->get_digits().size(); - for (uint32_t i = 0; i < r3_size; i++) { - reps += r3_ptr->get_digit(i) * (uint32_t) pow(10, r3_size - i - 1); - } - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTMultiplicationByte(std::move(r1), reps, reps)))); - } - - static unique_ptr regex_match_range_rule (NonTerminal* m) { - auto& r3 = m->nonterminal_cast(2)->getParserAST()->get>(); - auto* r3_ptr = dynamic_cast(r3.get()); - uint32_t min = 0; - uint32_t r3_size = r3_ptr->get_digits().size(); - for (uint32_t i = 0; i < r3_size; i++) { - min += r3_ptr->get_digit(i) * (uint32_t) pow(10, r3_size - i - 1); - } - auto& r5 = m->nonterminal_cast(4)->getParserAST()->get>(); - auto* r5_ptr = dynamic_cast(r5.get()); - uint32_t max = 0; - uint32_t r5_size = r5_ptr->get_digits().size(); - for (uint32_t i = 0; i < r5_size; i++) { - max += r5_ptr->get_digit(i) * (uint32_t) pow(10, r5_size - i - 1); - } - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTMultiplicationByte(std::move(r1), min, max)))); - } - - static unique_ptr regex_add_literal_existing_group_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - auto* r1_ptr = dynamic_cast(r1.get()); - auto* r2_ptr = dynamic_cast(r2.get()); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); - } - - static unique_ptr regex_add_range_existing_group_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - auto* r1_ptr = dynamic_cast(r1.get()); - auto* r2_ptr = dynamic_cast(r2.get()); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); - } - - static unique_ptr regex_add_literal_new_group_rule (NonTerminal* m) { - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - auto* r2_ptr = dynamic_cast(r2.get()); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr)))); - } - - static unique_ptr regex_add_range_new_group_rule (NonTerminal* m) { - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - auto* r2_ptr = dynamic_cast(r2.get()); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr)))); - } - - static unique_ptr regex_complement_incomplete_group_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(make_unique())); - } - - static unique_ptr regex_range_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto& 
r2 = m->nonterminal_cast(2)->getParserAST()->get>(); - auto* r1_ptr = dynamic_cast(r1.get()); - auto* r2_ptr = dynamic_cast(r2.get()); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); - } - - static unique_ptr regex_middle_identity_rule (NonTerminal* m) { - return unique_ptr( - new ParserValueRegex(std::move(m->nonterminal_cast(1)->getParserAST()->get>()))); - } - - static unique_ptr regex_literal_rule (NonTerminal* m) { - Token* token = m->token_cast(0); - assert(token->get_string().size() == 1); - return unique_ptr(new ParserValueRegex(unique_ptr( - new RegexASTLiteralByte(token->get_string()[0])))); - } - - static unique_ptr regex_cancel_literal_rule (NonTerminal* m) { - Token* token = m->token_cast(1); - assert(token->get_string().size() == 1); - return unique_ptr(new ParserValueRegex(unique_ptr( - new RegexASTLiteralByte(token->get_string()[0])))); - } - - static unique_ptr regex_existing_integer_rule (NonTerminal* m) { - auto& r2 = m->nonterminal_cast(0)->getParserAST()->get>(); - auto* r2_ptr = dynamic_cast(r2.get()); - Token* token = m->token_cast(1); - assert(token->get_string().size() == 1); - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTIntegerByte(r2_ptr, token->get_string()[0])))); - } - - static unique_ptr regex_new_integer_rule (NonTerminal* m) { - Token* token = m->token_cast(0); - assert(token->get_string().size() == 1); - return unique_ptr(new ParserValueRegex(unique_ptr( - new RegexASTIntegerByte(token->get_string()[0])))); - } - - static unique_ptr regex_digit_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTGroupByte('0', '9')))); - } - - static unique_ptr regex_wildcard_rule (NonTerminal* m) { - unique_ptr regex_wildcard = make_unique(0, cUnicodeMax); - regex_wildcard->set_is_wildcard_true(); - return unique_ptr(new ParserValueRegex(std::move(regex_wildcard))); - } - - static unique_ptr regex_vertical_tab_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\v')))); - } - - static unique_ptr regex_form_feed_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\f')))); - } - - static unique_ptr regex_tab_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\t')))); - } - - static unique_ptr regex_char_return_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\r')))); - } - - static unique_ptr regex_newline_rule (NonTerminal* m) { - return unique_ptr(new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\n')))); - } - - static unique_ptr regex_white_space_rule (NonTerminal* m) { - unique_ptr regex_ast_group = make_unique(RegexASTGroupByte({' ', '\t', '\r', '\n', '\v', '\f'})); - return unique_ptr(new ParserValueRegex(unique_ptr(std::move(regex_ast_group)))); - } - - static unique_ptr existing_delimiter_string_rule (NonTerminal* m) { - unique_ptr& r1 = m->nonterminal_cast(0)->getParserAST(); - auto& r2 = m->nonterminal_cast(1)->getParserAST()->get>(); - auto* r1_ptr = dynamic_cast(r1.get()); - uint32_t character = dynamic_cast(r2.get())->get_character(); - r1_ptr->add_delimiter(character); - return std::move(r1); - } - - static unique_ptr new_delimiter_string_rule (NonTerminal* m) { - auto& r1 = m->nonterminal_cast(0)->getParserAST()->get>(); - uint32_t character = dynamic_cast(r1.get())->get_character(); - return make_unique(character); - } - - void 
SchemaParser::add_lexical_rules () { - add_token("Tab", '\t'); //9 - add_token("NewLine", '\n'); //10 - add_token("VerticalTab", '\v'); //11 - add_token("FormFeed", '\f'); //12 - add_token("CarriageReturn", '\r'); //13 - add_token("Space", ' '); - add_token("Bang", '!'); - add_token("Quotation", '"'); - add_token("Hash", '#'); - add_token("DollarSign", '$'); - add_token("Percent", '%'); - add_token("Ampersand", '&'); - add_token("Apostrophe", '\''); - add_token("Lparen", '('); - add_token("Rparen", ')'); - add_token("Star", '*'); - add_token("Plus", '+'); - add_token("Comma", ','); - add_token("Dash", '-'); - add_token("Dot", '.'); - add_token("ForwardSlash", '/'); - add_token_group("Numeric", make_unique('0', '9')); - add_token("Colon", ':'); - add_token("SemiColon", ';'); - add_token("LAngle", '<'); - add_token("Equal", '='); - add_token("RAngle", '>'); - add_token("QuestionMark", '?'); - add_token("At", '@'); - add_token_group("AlphaNumeric", make_unique('a', 'z')); - add_token_group("AlphaNumeric", make_unique('A', 'Z')); - add_token_group("AlphaNumeric", make_unique('0', '9')); - add_token("Lbracket", '['); - add_token("Backslash", '\\'); - add_token("Rbracket", ']'); - add_token("Hat", '^'); - add_token("Underscore", '_'); - add_token("Backtick", '`'); - add_token("Lbrace", '{'); - add_token("Vbar", '|'); - add_token("Rbrace", '}'); - add_token("Tilde", '~'); - add_token("d", 'd'); - add_token("s", 's'); - add_token("n", 'n'); - add_token("r", 'r'); - add_token("t", 't'); - add_token("f", 'f'); - add_token("v", 'v'); - add_token_chain("Delimiters", "delimiters"); - // default constructs to a m_negate group - unique_ptr comment_characters = make_unique(); - comment_characters->add_literal('\r'); - comment_characters->add_literal('\n'); - add_token_group("CommentCharacters", std::move(comment_characters)); - } - - void SchemaParser::add_productions () { - // add_production("SchemaFile", {}, new_schema_file_rule); - add_production("SchemaFile", {"Comment"}, new_schema_file_rule); - add_production("SchemaFile", {"SchemaVar"}, new_schema_file_rule_with_var); - add_production("SchemaFile", {"Delimiters", "Colon", "DelimiterString"}, new_schema_file_rule_with_delimiters); - add_production("SchemaFile", {"SchemaFile", "PortableNewLine"}, identity_rule_ParserASTSchemaFile); - add_production("SchemaFile", {"SchemaFile", "PortableNewLine", "Comment"}, identity_rule_ParserASTSchemaFile); - add_production("SchemaFile", {"SchemaFile", "PortableNewLine", "SchemaVar"}, - std::bind(&SchemaParser::existing_schema_file_rule, this, std::placeholders::_1)); - add_production("SchemaFile", {"SchemaFile", "PortableNewLine", "Delimiters", "Colon", "DelimiterString"}, existing_schema_file_rule_with_delimiter); - add_production("DelimiterString", {"DelimiterString", "Literal"}, existing_delimiter_string_rule); - add_production("DelimiterString", {"Literal"}, new_delimiter_string_rule); - add_production("PortableNewLine", {"CarriageReturn", "NewLine"}, nullptr); - add_production("PortableNewLine", {"NewLine"}, nullptr); - add_production("Comment", {"ForwardSlash", "ForwardSlash", "Text"}, nullptr); - add_production("Text", {"Text", "CommentCharacters"}, nullptr); - add_production("Text", {"CommentCharacters"}, nullptr); - add_production("Text", {"Text", "Delimiters"}, nullptr); - add_production("Text", {"Delimiters"}, nullptr); - add_production("SchemaVar", {"WhitespaceStar", "Identifier", "Colon", "Regex"}, schema_var_rule); - add_production("Identifier", {"Identifier", "AlphaNumeric"}, 
existing_identifier_rule); - add_production("Identifier", {"AlphaNumeric"}, new_identifier_rule); - add_production("WhitespaceStar", {"WhitespaceStar", "Space"}, nullptr); - add_production("WhitespaceStar", {}, nullptr); - add_production("Regex", {"Concat"}, regex_identity_rule); - add_production("Concat", {"Concat", "Or"}, regex_cat_rule); - add_production("Concat", {"Or"}, regex_identity_rule); - add_production("Or", {"Or", "Vbar", "Literal"}, regex_or_rule); - add_production("Or", {"MatchStar"}, regex_identity_rule); - add_production("Or", {"MatchPlus"}, regex_identity_rule); - add_production("Or", {"MatchExact"}, regex_identity_rule); - add_production("Or", {"MatchRange"}, regex_identity_rule); - add_production("Or", {"CompleteGroup"}, regex_identity_rule); - add_production("MatchStar", {"CompleteGroup", "Star"}, regex_match_zero_or_more_rule); - add_production("MatchPlus", {"CompleteGroup", "Plus"}, regex_match_one_or_more_rule); - add_production("MatchExact", {"CompleteGroup", "Lbrace", "Integer", "Rbrace"}, regex_match_exactly_rule); - add_production("MatchRange", {"CompleteGroup", "Lbrace", "Integer", "Comma", "Integer", "Rbrace"}, regex_match_range_rule); - add_production("CompleteGroup", {"IncompleteGroup", "Rbracket"}, regex_identity_rule); - add_production("CompleteGroup", {"Literal"}, regex_identity_rule); - add_production("CompleteGroup", {"Digit"}, regex_identity_rule); - add_production("CompleteGroup", {"Wildcard"}, regex_identity_rule); - add_production("CompleteGroup", {"WhiteSpace"}, regex_identity_rule); - add_production("IncompleteGroup", {"IncompleteGroup", "LiteralRange"}, regex_add_range_existing_group_rule); - add_production("IncompleteGroup", {"IncompleteGroup", "Digit"}, regex_add_range_existing_group_rule); - add_production("IncompleteGroup", {"IncompleteGroup", "Literal"}, regex_add_literal_existing_group_rule); - add_production("IncompleteGroup", {"IncompleteGroup", "WhiteSpace"}, regex_add_literal_existing_group_rule); - add_production("IncompleteGroup", {"Lbracket", "LiteralRange"}, regex_add_range_new_group_rule); - add_production("IncompleteGroup", {"Lbracket", "Digit"}, regex_add_range_new_group_rule); - add_production("IncompleteGroup", {"Lbracket", "Literal"}, regex_add_literal_new_group_rule); - add_production("IncompleteGroup", {"Lbracket", "WhiteSpace"}, regex_add_literal_new_group_rule); - add_production("IncompleteGroup", {"Lbracket", "Hat"}, regex_complement_incomplete_group_rule); - add_production("LiteralRange", {"Literal", "Dash", "Literal"}, regex_range_rule); - add_production("Literal", {"Backslash", "t"}, regex_tab_rule); - add_production("Literal", {"Backslash", "n"}, regex_newline_rule); - add_production("Literal", {"Backslash", "v"}, regex_vertical_tab_rule); - add_production("Literal", {"Backslash", "f"}, regex_form_feed_rule); - add_production("Literal", {"Backslash", "r"}, regex_char_return_rule); - add_production("Literal", {"Space"}, regex_literal_rule); - add_production("Literal", {"Bang"}, regex_literal_rule); - add_production("Literal", {"Quotation"}, regex_literal_rule); - add_production("Literal", {"Hash"}, regex_literal_rule); - add_production("Literal", {"DollarSign"}, regex_literal_rule); - add_production("Literal", {"Percent"}, regex_literal_rule); - add_production("Literal", {"Ampersand"}, regex_literal_rule); - add_production("Literal", {"Apostrophe"}, regex_literal_rule); - add_production("Literal", {"Backslash", "Lparen"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Rparen"}, 
regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Star"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Plus"}, regex_cancel_literal_rule); - add_production("Literal", {"Comma"}, regex_literal_rule); - add_production("Literal", {"Backslash", "Dash"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Dot"}, regex_cancel_literal_rule); - add_production("Literal", {"ForwardSlash"}, regex_literal_rule); - add_production("Literal", {"AlphaNumeric"}, regex_literal_rule); - add_production("Literal", {"Colon"}, regex_literal_rule); - add_production("Literal", {"SemiColon"}, regex_literal_rule); - add_production("Literal", {"LAngle"}, regex_literal_rule); - add_production("Literal", {"Equal"}, regex_literal_rule); - add_production("Literal", {"RAngle"}, regex_literal_rule); - add_production("Literal", {"QuestionMark"}, regex_literal_rule); - add_production("Literal", {"At"}, regex_literal_rule); - add_production("Literal", {"Backslash", "Lbracket"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Backslash"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Rbracket"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Hat"}, regex_cancel_literal_rule); - add_production("Literal", {"Underscore"}, regex_literal_rule); - add_production("Literal", {"Backtick"}, regex_literal_rule); - add_production("Literal", {"Backslash", "Lbrace"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Vbar"}, regex_cancel_literal_rule); - add_production("Literal", {"Backslash", "Rbrace"}, regex_cancel_literal_rule); - add_production("Literal", {"Tilde"}, regex_literal_rule); - add_production("Literal", {"Lparen", "Regex", "Rparen"}, regex_middle_identity_rule); - add_production("Integer", {"Integer", "Numeric"}, regex_existing_integer_rule); - add_production("Integer", {"Numeric"}, regex_new_integer_rule); - add_production("Digit", {"Backslash", "d"}, regex_digit_rule); - add_production("Wildcard", {"Dot"}, regex_wildcard_rule); - add_production("WhiteSpace", {"Backslash", "s"}, regex_white_space_rule); - } -} \ No newline at end of file diff --git a/components/core/src/compressor_frontend/SchemaParser.hpp b/components/core/src/compressor_frontend/SchemaParser.hpp deleted file mode 100644 index 10375d7f0..000000000 --- a/components/core/src/compressor_frontend/SchemaParser.hpp +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_SCHEMAPARSER_HPP -#define COMPRESSOR_FRONTEND_SCHEMAPARSER_HPP - -// Boost libraries -#include -#include - -// Project headers -#include "../ReaderInterface.hpp" -#include "LALR1Parser.hpp" - -namespace compressor_frontend { - - using finite_automata::RegexDFAByteState; - using finite_automata::RegexNFAByteState; - - // ASTs used in SchemaParser AST - class SchemaFileAST : public ParserAST { - public: - // Constructor - SchemaFileAST () = default; - - /// TODO: shouldn't this add delimiters instead of setting it? 
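One way the TODO above could be resolved is a hypothetical add_delimiters() that merges incoming delimiters into any existing set instead of overwriting it, as set_delimiters() below does. A self-contained sketch; the stand-in types mirror the classes in this header, and none of this is part of the patch:

    #include <cstdint>
    #include <memory>
    #include <vector>

    // Minimal stand-ins for ParserAST and DelimiterStringAST (see this header).
    struct ParserAST { virtual ~ParserAST () = default; };
    struct DelimiterStringAST : ParserAST {
        void add_delimiter (uint32_t delimiter) { m_delimiters.push_back(delimiter); }
        std::vector<uint32_t> m_delimiters;
    };

    // Hypothetical additive variant: keep the first delimiter set and fold any
    // later "delimiters:" lines into it rather than replacing it.
    struct SchemaFileAST {
        void add_delimiters (std::unique_ptr<ParserAST> delimiters_in) {
            if (nullptr == m_delimiters) {
                m_delimiters = std::move(delimiters_in);
                return;
            }
            auto* existing = dynamic_cast<DelimiterStringAST*>(m_delimiters.get());
            auto* incoming = dynamic_cast<DelimiterStringAST*>(delimiters_in.get());
            if (existing != nullptr && incoming != nullptr) {
                for (uint32_t delimiter : incoming->m_delimiters) {
                    existing->add_delimiter(delimiter);
                }
            }
        }
        std::unique_ptr<ParserAST> m_delimiters;
    };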
- void set_delimiters (std::unique_ptr delimiters_in) { - m_delimiters = std::move(delimiters_in); - } - - void add_schema_var (std::unique_ptr schema_var) { - m_schema_vars.push_back(std::move(schema_var)); - } - - std::vector> m_schema_vars; - std::unique_ptr m_delimiters; - std::string m_file_path; - }; - - class IdentifierAST : public ParserAST { - public: - // Constructor - explicit IdentifierAST (char character) { - m_name.push_back(character); - } - - void add_character (char character) { - m_name.push_back(character); - } - - std::string m_name; - }; - - class SchemaVarAST : public ParserAST { - public: - //Constructor - SchemaVarAST (std::string name, std::unique_ptr> regex_ptr, uint32_t line_num) : m_name(std::move(name)), - m_regex_ptr(std::move(regex_ptr)), - m_line_num(line_num) {} - - uint32_t m_line_num; - std::string m_name; - std::unique_ptr> m_regex_ptr; - }; - - class DelimiterStringAST : public ParserAST { - public: - // Constructor - explicit DelimiterStringAST (uint32_t delimiter) { - m_delimiters.push_back(delimiter); - } - - void add_delimiter (uint32_t delimiter) { - m_delimiters.push_back(delimiter); - } - - std::vector m_delimiters; - }; - - // Schema Parser itself - - class SchemaParser : public LALR1Parser { - public: - // Constructor - SchemaParser (); - - /** - * A semantic rule that needs access to soft_reset() - * @param m - * @return std::unique_ptr - */ - std::unique_ptr existing_schema_file_rule (NonTerminal* m); - - /** - * Parse a user defined schema to generate a schema AST used for generating the log lexer - * @param reader - * @return std::unique_ptr - */ - std::unique_ptr generate_schema_ast (ReaderInterface& reader); - - /** - * Wrapper around generate_schema_ast() - * @param schema_file_path - * @return std::unique_ptr - */ - static std::unique_ptr try_schema_file (const std::string& schema_file_path); - - private: - /** - * Add all lexical rules needed for schema lexing - */ - void add_lexical_rules (); - - /** - * Add all productions needed for schema parsing - */ - void add_productions (); - }; -} - -#endif // COMPRESSOR_FRONTEND_SCHEMAPARSER_HPP diff --git a/components/core/src/compressor_frontend/Token.cpp b/components/core/src/compressor_frontend/Token.cpp deleted file mode 100644 index 4c984d0af..000000000 --- a/components/core/src/compressor_frontend/Token.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include "Token.hpp" - -using std::string; - -namespace compressor_frontend { - - string Token::get_string () const { - if (m_start_pos <= m_end_pos) { - return {*m_buffer_ptr + m_start_pos, *m_buffer_ptr + m_end_pos}; - } else { - return string(*m_buffer_ptr + m_start_pos, *m_buffer_ptr + *m_buffer_size_ptr) + - string(*m_buffer_ptr, *m_buffer_ptr + m_end_pos); - } - } - - char Token::get_char (uint8_t i) const { - return (*m_buffer_ptr)[m_start_pos + i]; - } - - string Token::get_delimiter () const { - return {*m_buffer_ptr + m_start_pos, *m_buffer_ptr + m_start_pos + 1}; - } - - uint32_t Token::get_length () const { - if (m_start_pos <= m_end_pos) { - return m_end_pos - m_start_pos; - } else { - return *m_buffer_size_ptr - m_start_pos + m_end_pos; - } - } -} \ No newline at end of file diff --git a/components/core/src/compressor_frontend/Token.hpp b/components/core/src/compressor_frontend/Token.hpp deleted file mode 100644 index d4db8396b..000000000 --- a/components/core/src/compressor_frontend/Token.hpp +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_TOKEN_HPP -#define COMPRESSOR_FRONTEND_TOKEN_HPP - -// C++ standard libraries 
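Token::get_string() above reads a token back out of the lexer's circular buffer, stitching the buffer's tail and head together when the token wraps around the end. The same logic as a standalone function (illustrative names, same arithmetic as the original):

    #include <cstdint>
    #include <string>

    // When start_pos <= end_pos the token is contiguous; otherwise it wraps, and
    // its text is [start_pos, buffer_size) followed by [0, end_pos).
    std::string extract_token (char const* buffer, uint32_t buffer_size,
                               uint32_t start_pos, uint32_t end_pos) {
        if (start_pos <= end_pos) {
            return {buffer + start_pos, buffer + end_pos};
        }
        return std::string(buffer + start_pos, buffer + buffer_size)
               + std::string(buffer, buffer + end_pos);
    }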
-#include -#include - -namespace compressor_frontend { - class Token { - public: - // Constructor - Token () : m_buffer_ptr(nullptr), m_buffer_size_ptr(nullptr), m_type_ids(nullptr), m_start_pos(0), m_end_pos(0), m_line(0) {} - - // Constructor - Token (uint32_t start_pos, uint32_t end_pos, char** buffer_ptr, const uint32_t* buffer_size_ptr, uint32_t line, const std::vector* type_ids) : - m_start_pos(start_pos), m_end_pos(end_pos), m_buffer_ptr(buffer_ptr), m_buffer_size_ptr(buffer_size_ptr), m_line(line), m_type_ids(type_ids) {} - - /** - * Return the token string (string in the input buffer that the token represents) - * @return std::string - */ - [[nodiscard]] std::string get_string () const; - - /** - * Return the first character (as a string) of the token string (which is a delimiter if delimiters are being used) - * @return std::string - */ - [[nodiscard]] std::string get_delimiter () const; - - /** - * Return the ith character of the token string - * @param i - * @return char - */ - [[nodiscard]] char get_char (uint8_t i) const; - - /** - * Get the length of the token string - * @return uint32_t - */ - [[nodiscard]] uint32_t get_length () const; - - uint32_t m_start_pos; - uint32_t m_end_pos; - char** m_buffer_ptr; - const uint32_t* m_buffer_size_ptr; - uint32_t m_line; - const std::vector* m_type_ids; - }; -} - -#endif // COMPRESSOR_FRONTEND_TOKEN_HPP \ No newline at end of file diff --git a/components/core/src/compressor_frontend/finite_automata/RegexAST.hpp b/components/core/src/compressor_frontend/finite_automata/RegexAST.hpp deleted file mode 100644 index f40796b3f..000000000 --- a/components/core/src/compressor_frontend/finite_automata/RegexAST.hpp +++ /dev/null @@ -1,449 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_HPP -#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_HPP - -// C++ standard libraries -#include -#include -#include -#include -#include -#include - -// Project headers -#include "../Constants.hpp" -#include "RegexNFA.hpp" -#include "UnicodeIntervalTree.hpp" - -namespace compressor_frontend::finite_automata { - - template - class RegexAST { - public: - // Destructor - virtual ~RegexAST () = default; - - /** - * Used for cloning a unique_pointer of base type RegexAST - * @return RegexAST* - */ - [[nodiscard]] virtual RegexAST* clone () const = 0; - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * @param is_possible_input - */ - virtual void set_possible_inputs_to_true (bool is_possible_input[]) const = 0; - - /** - * transform '.' 
from any-character into any non-delimiter in a lexer rule - * @param delimiters - */ - virtual void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) = 0; - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle the current node before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - virtual void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) = 0; - }; - - // Leaf node - template <typename NFAStateType> - class RegexASTLiteral : public RegexAST<NFAStateType> { - public: - // Constructor - explicit RegexASTLiteral (uint32_t character); - - /** - * Used for cloning a unique_ptr of type RegexASTLiteral - * @return RegexASTLiteral* - */ - [[nodiscard]] RegexASTLiteral<NFAStateType>* clone () const override { - return new RegexASTLiteral<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTLiteral at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - is_possible_input[m_character] = true; - } - - /** - * Transforms '.' to be any non-delimiter in a lexer rule, which does nothing as RegexASTLiteral is a leaf node that is not a RegexASTGroup - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - // DO NOTHING - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTLiteral before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - [[nodiscard]] const uint32_t& get_character () const { - return m_character; - } - - private: - uint32_t m_character; - - }; - - // Leaf node - template <typename NFAStateType> - class RegexASTInteger : public RegexAST<NFAStateType> { - public: - // Constructor - explicit RegexASTInteger (uint32_t digit); - - // Constructor - RegexASTInteger (RegexASTInteger<NFAStateType>* left, uint32_t digit); - - /** - * Used for cloning a unique_ptr of type RegexASTInteger - * @return RegexASTInteger* - */ - [[nodiscard]] RegexASTInteger<NFAStateType>* clone () const override { - return new RegexASTInteger<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTInteger at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - for (uint32_t i: m_digits) { - is_possible_input[i + '0'] = true; - } - } - - /** - * Transforms '.' to
be any non-delimiter in a lexer rule, which does nothing as RegexASTInteger is a leaf node that is not a RegexASTGroup - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - // DO NOTHING - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTInteger before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - [[nodiscard]] const std::vector<uint32_t>& get_digits () const { - return m_digits; - } - - [[nodiscard]] const uint32_t& get_digit (uint32_t i) const { - return m_digits[i]; - } - - private: - std::vector<uint32_t> m_digits; - }; - - // Leaf node - template <typename NFAStateType> - class RegexASTGroup : public RegexAST<NFAStateType> { - public: - - typedef std::pair<uint32_t, uint32_t> Range; - - // constructor - RegexASTGroup (); - - // constructor - RegexASTGroup (RegexASTGroup<NFAStateType>* left, RegexASTLiteral<NFAStateType>* right); - - // constructor - RegexASTGroup (RegexASTGroup<NFAStateType>* left, RegexASTGroup<NFAStateType>* right); - - // constructor - explicit RegexASTGroup (RegexASTLiteral<NFAStateType>* right); - - // constructor - explicit RegexASTGroup (RegexASTGroup<NFAStateType>* right); - - // constructor - RegexASTGroup (RegexASTLiteral<NFAStateType>* left, RegexASTLiteral<NFAStateType>* right); - - // constructor - RegexASTGroup (uint32_t min, uint32_t max); - - // constructor - explicit RegexASTGroup (const std::vector<uint32_t>& literals); - - /** - * Used for cloning a unique_ptr of type RegexASTGroup - * @return RegexASTGroup* - */ - [[nodiscard]] RegexASTGroup<NFAStateType>* clone () const override { - return new RegexASTGroup<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTGroup at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - if (!m_negate) { - for (Range range: m_ranges) { - for (uint32_t i = range.first; i <= range.second; i++) { - is_possible_input[i] = true; - } - } - } else { - std::vector<bool> inputs(cUnicodeMax, true); - for (Range range: m_ranges) { - for (uint32_t i = range.first; i <= range.second; i++) { - inputs[i] = false; - } - } - for (uint32_t i = 0; i < inputs.size(); i++) { - if (inputs[i]) { - is_possible_input[i] = true; - } - } - } - }
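The pow()-based loops in regex_match_exactly_rule() and regex_match_range_rule() earlier decode RegexASTInteger's digit vector into a repetition count; an equivalent integer-only fold, assuming the digits are stored most-significant first as above:

    #include <cstdint>
    #include <vector>

    // Float-free equivalent of the pow() loops above: shift the accumulated
    // value left one decimal place per digit, then add the digit.
    uint32_t digits_to_value (std::vector<uint32_t> const& digits) {
        uint32_t value = 0;
        for (uint32_t digit : digits) {
            value = value * 10 + digit;
        }
        return value;
    }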
- - /** - * Transforms '.' to be any non-delimiter in a lexer rule if this RegexASTGroup node contains `.` (is a wildcard group) - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - if (!m_is_wildcard) { - return; - } - if (delimiters.empty()) { - return; - } - m_ranges.clear(); - std::sort(delimiters.begin(), delimiters.end()); - if (delimiters[0] != 0) { - Range range(0, delimiters[0] - 1); - m_ranges.push_back(range); - } - for (uint32_t i = 1; i < delimiters.size(); i++) { - if (delimiters[i] - delimiters[i - 1] > 1) { - Range range(delimiters[i - 1] + 1, delimiters[i] - 1); - m_ranges.push_back(range); - } - } - if (delimiters.back() != cUnicodeMax) { - Range range(delimiters.back() + 1, cUnicodeMax); - m_ranges.push_back(range); - } - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTGroup before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - void add_range (uint32_t min, uint32_t max) { - m_ranges.emplace_back(min, max); - } - - void add_literal (uint32_t literal) { - m_ranges.emplace_back(literal, literal); - } - - void set_is_wildcard_true () { - m_is_wildcard = true; - } - - private: - /** - * Merges multiple ranges such that the resulting m_ranges is sorted and non-overlapping - * @param ranges - * @return std::vector<Range> - */ - static std::vector<Range> merge (const std::vector<Range>& ranges); - - /** - * Takes the complement (in the case of regex `^` at the start of a group) of multiple ranges such that m_ranges is sorted and non-overlapping - * @param ranges - * @return std::vector<Range> - */ - static std::vector<Range> complement (const std::vector<Range>& ranges); - - bool m_is_wildcard; - bool m_negate; - std::vector<Range> m_ranges; - - - };
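RegexASTGroup::remove_delimiters_from_wildcard() above is an interval complement over the delimiter set. The same computation as a standalone function; cUnicodeMax is assumed to be 0x10FFFF here, since Constants.hpp's value is not shown in this patch:

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    static constexpr uint32_t cUnicodeMax = 0x10FFFF;  // assumed value

    // Returns the sorted, non-overlapping ranges covering every code point that
    // is not a delimiter, mirroring the method above.
    std::vector<std::pair<uint32_t, uint32_t>>
    non_delimiter_ranges (std::vector<uint32_t> delimiters) {
        std::vector<std::pair<uint32_t, uint32_t>> ranges;
        if (delimiters.empty()) {
            ranges.emplace_back(0, cUnicodeMax);
            return ranges;
        }
        std::sort(delimiters.begin(), delimiters.end());
        if (delimiters.front() != 0) {
            ranges.emplace_back(0, delimiters.front() - 1);
        }
        for (size_t i = 1; i < delimiters.size(); ++i) {
            if (delimiters[i] - delimiters[i - 1] > 1) {
                // Gap between consecutive delimiters becomes a kept range.
                ranges.emplace_back(delimiters[i - 1] + 1, delimiters[i] - 1);
            }
        }
        if (delimiters.back() != cUnicodeMax) {
            ranges.emplace_back(delimiters.back() + 1, cUnicodeMax);
        }
        return ranges;
    }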
- - // Intermediate node - - template <typename NFAStateType> - class RegexASTOr : public RegexAST<NFAStateType> { - public: - // Constructor - RegexASTOr (std::unique_ptr<RegexAST<NFAStateType>>, std::unique_ptr<RegexAST<NFAStateType>>); - - // Constructor - RegexASTOr (const RegexASTOr& rhs) { - m_left = std::unique_ptr<RegexAST<NFAStateType>>(rhs.m_left->clone()); - m_right = std::unique_ptr<RegexAST<NFAStateType>>(rhs.m_right->clone()); - } - - /** - * Used for cloning a unique_ptr of type RegexASTOr - * @return RegexASTOr* - */ - [[nodiscard]] RegexASTOr<NFAStateType>* clone () const override { - return new RegexASTOr<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTOr at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - m_left->set_possible_inputs_to_true(is_possible_input); - m_right->set_possible_inputs_to_true(is_possible_input); - } - - /** - * Transforms '.' to be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a descendant of this RegexASTOr node - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - m_left->remove_delimiters_from_wildcard(delimiters); - m_right->remove_delimiters_from_wildcard(delimiters); - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTOr before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - private: - std::unique_ptr<RegexAST<NFAStateType>> m_left; - std::unique_ptr<RegexAST<NFAStateType>> m_right; - }; - - // Intermediate node - template <typename NFAStateType> - class RegexASTCat : public RegexAST<NFAStateType> { - public: - // Constructor - RegexASTCat (std::unique_ptr<RegexAST<NFAStateType>>, std::unique_ptr<RegexAST<NFAStateType>>); - - // Constructor - RegexASTCat (const RegexASTCat& rhs) { - m_left = std::unique_ptr<RegexAST<NFAStateType>>(rhs.m_left->clone()); - m_right = std::unique_ptr<RegexAST<NFAStateType>>(rhs.m_right->clone()); - } - - /** - * Used for cloning a unique_ptr of type RegexASTCat - * @return RegexASTCat* - */ - [[nodiscard]] RegexASTCat<NFAStateType>* clone () const override { - return new RegexASTCat<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTCat at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - m_left->set_possible_inputs_to_true(is_possible_input); - m_right->set_possible_inputs_to_true(is_possible_input); - } - - /** - * Transforms '.' to be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a descendant of this RegexASTCat node - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - m_left->remove_delimiters_from_wildcard(delimiters); - m_right->remove_delimiters_from_wildcard(delimiters); - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTCat before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - private: - std::unique_ptr<RegexAST<NFAStateType>> m_left; - std::unique_ptr<RegexAST<NFAStateType>> m_right; - }; - - // Intermediate node - template <typename NFAStateType> - class RegexASTMultiplication : public RegexAST<NFAStateType> { - public: - // Constructor - RegexASTMultiplication (std::unique_ptr<RegexAST<NFAStateType>>, uint32_t, uint32_t); - - // Constructor - RegexASTMultiplication (const RegexASTMultiplication& rhs) { - m_operand = std::unique_ptr<RegexAST<NFAStateType>>(rhs.m_operand->clone()); - m_min = rhs.m_min; - m_max = rhs.m_max; - } - - /** - * Used for cloning a unique_ptr of type RegexASTMultiplication - * @return RegexASTMultiplication* - */ - [[nodiscard]] RegexASTMultiplication<NFAStateType>* clone () const override { - return new RegexASTMultiplication<NFAStateType>(*this); - } - - /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule containing RegexASTMultiplication at a leaf node in its AST - * @param is_possible_input - */ - void set_possible_inputs_to_true (bool is_possible_input[]) const override { - m_operand->set_possible_inputs_to_true(is_possible_input); - } - - /** - * Transforms '.' to
be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a descendant of this RegexASTMultiplication node - * @param delimiters - */ - void remove_delimiters_from_wildcard (std::vector<uint32_t>& delimiters) override { - m_operand->remove_delimiters_from_wildcard(delimiters); - } - - /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTMultiplication before transitioning to a pre-tagged end_state - * @param nfa - * @param end_state - */ - void add (RegexNFA<NFAStateType>* nfa, NFAStateType* end_state) override; - - [[nodiscard]] bool is_infinite () const { - return this->m_max == 0; - } - - private: - std::unique_ptr<RegexAST<NFAStateType>> m_operand; - uint32_t m_min; - uint32_t m_max; - }; -} - -#include "RegexAST.inc" - -#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_HPP diff --git a/components/core/src/compressor_frontend/finite_automata/RegexAST.inc b/components/core/src/compressor_frontend/finite_automata/RegexAST.inc deleted file mode 100644 index 650d305f5..000000000 --- a/components/core/src/compressor_frontend/finite_automata/RegexAST.inc +++ /dev/null @@ -1,262 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_TPP -#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_TPP - -#include "RegexAST.hpp" - -// C++ standard libraries -#include -#include -#include -#include - -// Project headers -#include "../../spdlog_with_specializations.hpp" -#include "../Constants.hpp" -#include "RegexNFA.hpp" -#include "UnicodeIntervalTree.hpp" - -/* In order to use std::unordered_map (or absl::flat_hash_map) we need to have - * a specialization for hash from boost, abseil, etc. As far as we know, replacing - * std::set (i.e. an ordered set) with an unordered set is difficult due to - * fundamental issues of making an unordered data structure hashable. - * (i.e. 
you need two containers with the same elements in differing orders to - * hash to the same value, which makes computing/maintaining the hash of this - * unordered container non-trivial) - */ - -/// TODO: remove general `using` expressions like these from tpp -using std::map; -using std::max; -using std::min; -using std::pair; -using std::runtime_error; -using std::stack; -using std::unique_ptr; -using std::vector; - -namespace compressor_frontend::finite_automata { - - template - RegexASTLiteral::RegexASTLiteral (uint32_t character) : m_character(character) { - - } - - template - void RegexASTLiteral::add (RegexNFA* nfa, NFAStateType* end_state) { - nfa->add_root_interval(Interval(m_character, m_character), end_state); - } - - template - RegexASTInteger::RegexASTInteger (uint32_t digit) { - digit = digit - '0'; - m_digits.push_back(digit); - } - - template - RegexASTInteger::RegexASTInteger (RegexASTInteger* left, uint32_t digit) { - digit = digit - '0'; - m_digits = std::move(left->m_digits); - m_digits.push_back(digit); - } - - template - void RegexASTInteger::add (RegexNFA* nfa, NFAStateType* end_state) { - assert(false); // this shouldn't ever be called - } - - template - RegexASTOr::RegexASTOr (unique_ptr> left, unique_ptr> right) : m_left(std::move(left)), - m_right(std::move(right)) { - - } - - template - void RegexASTOr::add (RegexNFA* nfa, NFAStateType* end_state) { - m_left->add(nfa, end_state); - m_right->add(nfa, end_state); - } - - template - RegexASTCat::RegexASTCat (unique_ptr> left, unique_ptr> right) : m_left(std::move(left)), - m_right(std::move(right)) { - - } - - template - void RegexASTCat::add (RegexNFA* nfa, NFAStateType* end_state) { - NFAStateType* saved_root = nfa->m_root; - NFAStateType* intermediate_state = nfa->new_state(); - m_left->add(nfa, intermediate_state); - nfa->m_root = intermediate_state; - m_right->add(nfa, end_state); - nfa->m_root = saved_root; - } - - template - RegexASTMultiplication::RegexASTMultiplication (unique_ptr> operand, uint32_t min, uint32_t max) : - m_operand(std::move(operand)), m_min(min), m_max(max) { - - } - - template - void RegexASTMultiplication::add (RegexNFA* nfa, NFAStateType* end_state) { - NFAStateType* saved_root = nfa->m_root; - if (this->m_min == 0) { - nfa->m_root->add_epsilon_transition(end_state); - } else { - for (int i = 1; i < this->m_min; i++) { - NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); - nfa->m_root = intermediate_state; - } - m_operand->add(nfa, end_state); - } - if (this->is_infinite()) { - nfa->m_root = end_state; - m_operand->add(nfa, end_state); - } else if (this->m_max > this->m_min) { - if (this->m_min != 0) { - NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); - nfa->m_root = intermediate_state; - } - for (uint32_t i = this->m_min + 1; i < this->m_max; i++) { - m_operand->add(nfa, end_state); - NFAStateType* intermediate_state = nfa->new_state(); - m_operand->add(nfa, intermediate_state); - nfa->m_root = intermediate_state; - } - m_operand->add(nfa, end_state); - } - nfa->m_root = saved_root; - } - - template - RegexASTGroup::RegexASTGroup () { - m_is_wildcard = false; - m_negate = true; - } - - template - RegexASTGroup::RegexASTGroup (RegexASTGroup* left, RegexASTLiteral* right) { - m_is_wildcard = false; - if (right == nullptr) { - SPDLOG_ERROR("A bracket expression in the schema contains illegal characters, remember to escape special characters. 
" - "Refer to README-Schema.md for more details."); - throw runtime_error("RegexASTGroup1: right==nullptr"); - } - m_negate = left->m_negate; - m_ranges = left->m_ranges; - m_ranges.emplace_back(right->get_character(), right->get_character()); - } - - template - RegexASTGroup::RegexASTGroup (RegexASTGroup* left, RegexASTGroup* right) { - m_is_wildcard = false; - m_negate = left->m_negate; - m_ranges = left->m_ranges; - assert(right->m_ranges.size() == 1); // Only add LiteralRange - m_ranges.push_back(right->m_ranges[0]); - } - - template - RegexASTGroup::RegexASTGroup (RegexASTLiteral* right) { - m_is_wildcard = false; - if (right == nullptr) { - SPDLOG_ERROR("A bracket expression in the schema contains illegal characters, remember to escape special characters. " - "Refer to README-Schema.md for more details."); - throw runtime_error("RegexASTGroup2: right==nullptr"); - } - m_negate = false; - m_ranges.emplace_back(right->get_character(), right->get_character()); - } - - template - RegexASTGroup::RegexASTGroup (RegexASTGroup* right) { - m_is_wildcard = false; - m_negate = false; - assert(right->m_ranges.size() == 1); // Only add LiteralRange - m_ranges.push_back(right->m_ranges[0]); - } - - template - RegexASTGroup::RegexASTGroup (RegexASTLiteral* left, RegexASTLiteral* right) { - m_is_wildcard = false; - if (left == nullptr || right == nullptr) { - SPDLOG_ERROR("A bracket expression in the schema contains illegal characters, remember to escape special characters. " - "Refer to README-Schema.md for more details."); - throw runtime_error("RegexASTGroup3: left == nullptr || right == nullptr"); - } - m_negate = false; - assert(right->get_character() > left->get_character()); - m_ranges.emplace_back(left->get_character(), right->get_character()); - } - - template - RegexASTGroup::RegexASTGroup (const vector& literals) { - m_is_wildcard = false; - m_negate = false; - for (uint32_t literal: literals) { - m_ranges.emplace_back(literal, literal); - } - } - - template - RegexASTGroup::RegexASTGroup (uint32_t min, uint32_t max) { - m_is_wildcard = false; - m_negate = false; - m_ranges.emplace_back(min, max); - } - - // ranges must be sorted - template - vector::Range> RegexASTGroup::merge (const vector& ranges) { - vector merged; - if (ranges.empty()) { - return merged; - } - Range cur = ranges[0]; - for (size_t i = 1; i < ranges.size(); i++) { - Range r = ranges[i]; - if (r.first <= cur.second + 1) { - cur.second = max(r.second, cur.second); - } else { - merged.push_back(cur); - cur = r; - } - } - merged.push_back(cur); - return merged; - } - - // ranges must be sorted and non-overlapping - template - vector::Range> RegexASTGroup::complement (const vector& ranges) { - vector complemented; - uint32_t low = 0; - for (const Range& r: ranges) { - if (r.first > 0) { - complemented.emplace_back(low, r.first - 1); - } - low = r.second + 1; - } - if (low > 0) { - complemented.emplace_back(low, cUnicodeMax); - } - return complemented; - } - - template - void RegexASTGroup::add (RegexNFA* nfa, NFAStateType* end_state) { - std::sort(this->m_ranges.begin(), this->m_ranges.end()); - vector merged = RegexASTGroup::merge(this->m_ranges); - if (this->m_negate) { - merged = RegexASTGroup::complement(merged); - } - for (const Range& r: merged) { - nfa->m_root->add_interval(Interval(r.first, r.second), end_state); - } - } -} - -#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_AST_TPP \ No newline at end of file diff --git a/components/core/src/compressor_frontend/finite_automata/RegexDFA.hpp 
b/components/core/src/compressor_frontend/finite_automata/RegexDFA.hpp deleted file mode 100644 index f532c93c5..000000000 --- a/components/core/src/compressor_frontend/finite_automata/RegexDFA.hpp +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_HPP -#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_HPP - -// C++ standard libraries -#include -#include -#include -#include -#include -#include - -// Project headers -#include "../Constants.hpp" -#include "RegexNFA.hpp" - -namespace compressor_frontend::finite_automata { - enum class RegexDFAStateType { - Byte, - UTF8 - }; - - template - class RegexDFAState { - public: - using Tree = UnicodeIntervalTree*>; - - void add_tag (const int& rule_name_id) { - m_tags.push_back(rule_name_id); - } - - [[nodiscard]] const std::vector& get_tags () const { - return m_tags; - } - - bool is_accepting () { - return !m_tags.empty(); - } - - void add_byte_transition (const uint8_t& byte, RegexDFAState* dest_state) { - m_bytes_transition[byte] = dest_state; - } - - /** - * Returns the next state the DFA transitions to on input character (byte or utf8) - * @param character - * @return RegexDFAState* - */ - RegexDFAState* next (uint32_t character); - - - private: - std::vector m_tags; - RegexDFAState* m_bytes_transition[cSizeOfByte]; - - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. - std::conditional_t> m_tree_transitions; - }; - - using RegexDFAByteState = RegexDFAState; - using RegexDFAUTF8State = RegexDFAState; - - template - class RegexDFA { - public: - - /** - * Creates a new DFA state based on a set of NFA states and adds it to m_states - * @param set - * @return DFAStateType* - */ - template - DFAStateType* new_state (const std::set& set); - - DFAStateType* get_root () { - return m_states.at(0).get(); - } - - private: - std::vector> m_states; - }; -} - -#include "RegexDFA.inc" - -#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_HPP diff --git a/components/core/src/compressor_frontend/finite_automata/RegexDFA.inc b/components/core/src/compressor_frontend/finite_automata/RegexDFA.inc deleted file mode 100644 index 75a5774bb..000000000 --- a/components/core/src/compressor_frontend/finite_automata/RegexDFA.inc +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_TPP -#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_TPP - -#include "RegexDFA.hpp" - -namespace compressor_frontend::finite_automata { - - template - RegexDFAState* RegexDFAState::next (uint32_t character) { - if constexpr (RegexDFAStateType::Byte == stateType) { - return m_bytes_transition[character]; - } else { - if (character < cSizeOfByte) { - return m_bytes_transition[character]; - } - unique_ptr> result = m_tree_transitions.find(Interval(character, character)); - assert(result->size() <= 1); - if (!result->empty()) { - return result->front().m_value; - } - return nullptr; - } - } - - template - template - DFAStateType* RegexDFA::new_state (const std::set& set) { - std::unique_ptr ptr = std::make_unique(); - m_states.push_back(std::move(ptr)); - - DFAStateType* state = m_states.back().get(); - for (const NFAStateType* s: set) { - if (s->is_accepting()) { - state->add_tag(s->get_tag()); - } - } - return state; - } -} - -#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_DFA_TPP \ No newline at end of file diff --git a/components/core/src/compressor_frontend/finite_automata/RegexNFA.hpp 
b/components/core/src/compressor_frontend/finite_automata/RegexNFA.hpp deleted file mode 100644 index 415740fcd..000000000 --- a/components/core/src/compressor_frontend/finite_automata/RegexNFA.hpp +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_HPP -#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_HPP - -// C++ standard libraries -#include -#include -#include -#include -#include -#include -#include - -// Project headers -#include "../Constants.hpp" -#include "UnicodeIntervalTree.hpp" - -namespace compressor_frontend::finite_automata { - enum class RegexNFAStateType { - Byte, - UTF8 - }; - - template - class RegexNFAState { - public: - - using Tree = UnicodeIntervalTree*>; - - void set_accepting (bool accepting) { - m_accepting = accepting; - } - - [[nodiscard]] const bool& is_accepting () const { - return m_accepting; - } - - void set_tag (int rule_name_id) { - m_tag = rule_name_id; - } - - [[nodiscard]] const int& get_tag () const { - return m_tag; - } - - void set_epsilon_transitions (std::vector*>& epsilon_transitions) { - m_epsilon_transitions = epsilon_transitions; - } - - void add_epsilon_transition (RegexNFAState* epsilon_transition) { - m_epsilon_transitions.push_back(epsilon_transition); - } - - void clear_epsilon_transitions () { - m_epsilon_transitions.clear(); - } - - [[nodiscard]] const std::vector*>& get_epsilon_transitions () const { - return m_epsilon_transitions; - } - - void set_byte_transitions (uint8_t byte, std::vector*>& byte_transitions) { - m_bytes_transitions[byte] = byte_transitions; - } - - void add_byte_transition (uint8_t byte, RegexNFAState* dest_state) { - m_bytes_transitions[byte].push_back(dest_state); - } - - void clear_byte_transitions (uint8_t byte) { - m_bytes_transitions[byte].clear(); - } - - [[nodiscard]] const std::vector*>& get_byte_transitions (uint8_t byte) const { - return m_bytes_transitions[byte]; - } - - void reset_tree_transitions () { - m_tree_transitions.reset(); - } - - const Tree& get_tree_transitions () { - return m_tree_transitions; - } - - /** - Add dest_state to m_bytes_transitions if all values in interval are a byte, otherwise add dest_state to m_tree_transitions - * @param interval - * @param dest_state - */ - void add_interval (Interval interval, RegexNFAState* dest_state); - - private: - bool m_accepting; - int m_tag; - std::vector*> m_epsilon_transitions; - std::vector*> m_bytes_transitions[cSizeOfByte]; - - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. 
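The NOTE above describes a conditional-member trick; isolated from the class, it looks like the following sketch (simplified, assumed names, not the real classes):

    #include <tuple>
    #include <type_traits>

    enum class StateType { Byte, UTF8 };

    struct IntervalTree {};  // stand-in for UnicodeIntervalTree

    // For byte-only states the member collapses to an empty std::tuple<>, so
    // it carries no data and is simply never touched; UTF8 states get the tree.
    template <StateType state_type>
    struct State {
        std::conditional_t<StateType::Byte == state_type, std::tuple<>, IntervalTree>
                m_tree_transitions;
    };

    static_assert(std::is_empty_v<std::tuple<>>);  // the placeholder holds nothing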
-        std::conditional_t<stateType == RegexNFAStateType::Byte, std::tuple<>, Tree> m_tree_transitions;
-
-
-    };
-
-    using RegexNFAByteState = RegexNFAState<RegexNFAStateType::Byte>;
-    using RegexNFAUTF8State = RegexNFAState<RegexNFAStateType::UTF8>;
-
-    template <typename NFAStateType>
-    class RegexNFA {
-    public:
-        typedef std::vector<NFAStateType*> StateVec;
-
-        // constructor
-        RegexNFA ();
-
-        /**
-         * Create a unique_ptr for an NFA state and add it to m_states
-         * @return NFAStateType*
-         */
-        NFAStateType* new_state ();
-
-        /**
-         * Reverse the NFA such that it matches on its reverse language
-         */
-        void reverse ();
-
-        void add_root_interval (Interval interval, NFAStateType* dest_state) {
-            m_root->add_interval(interval, dest_state);
-        }
-
-        NFAStateType* m_root;
-
-    private:
-        std::vector<std::unique_ptr<NFAStateType>> m_states;
-    };
-}
-
-#include "RegexNFA.inc"
-
-#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_HPP
diff --git a/components/core/src/compressor_frontend/finite_automata/RegexNFA.inc b/components/core/src/compressor_frontend/finite_automata/RegexNFA.inc
deleted file mode 100644
index 287ef75bf..000000000
--- a/components/core/src/compressor_frontend/finite_automata/RegexNFA.inc
+++ /dev/null
@@ -1,188 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_TPP
-#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_TPP
-
-#include "RegexNFA.hpp"
-
-// C++ standard libraries
-#include
-#include
-#include
-#include
-
-// Project headers
-#include "../Constants.hpp"
-#include "UnicodeIntervalTree.hpp"
-
-using std::map;
-using std::max;
-using std::min;
-using std::pair;
-using std::stack;
-using std::unique_ptr;
-using std::vector;
-
-namespace compressor_frontend::finite_automata {
-    template <RegexNFAStateType stateType>
-    void RegexNFAState<stateType>::add_interval (Interval interval, RegexNFAState<stateType>* dest_state) {
-        if (interval.first < cSizeOfByte) {
-            uint32_t bound = min(interval.second, cSizeOfByte - 1);
-            for (uint32_t i = interval.first; i <= bound; i++) {
-                add_byte_transition(i, dest_state);
-            }
-            interval.first = bound + 1;
-        }
-        if constexpr (RegexNFAStateType::UTF8 == stateType) {
-            if (interval.second < cSizeOfByte) {
-                return;
-            }
-            unique_ptr<vector<typename Tree::Data>> overlaps = m_tree_transitions.pop(interval);
-            for (const typename Tree::Data& data: *overlaps) {
-                uint32_t overlap_low = max(data.m_interval.first, interval.first);
-                uint32_t overlap_high = min(data.m_interval.second, interval.second);
-
-                std::vector<RegexNFAState<stateType>*> tree_states = data.m_value;
-                tree_states.push_back(dest_state);
-                m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states);
-                if (data.m_interval.first < interval.first) {
-                    m_tree_transitions.insert(Interval(data.m_interval.first, interval.first - 1), data.m_value);
-                } else if (data.m_interval.first > interval.first) {
-                    m_tree_transitions.insert(Interval(interval.first, data.m_interval.first - 1), {dest_state});
-                }
-                if (data.m_interval.second > interval.second) {
-                    m_tree_transitions.insert(Interval(interval.second + 1, data.m_interval.second), data.m_value);
-                }
-                interval.first = data.m_interval.second + 1;
-            }
-            if (interval.first != 0 && interval.first <= interval.second) {
-                m_tree_transitions.insert(interval, {dest_state});
-            }
-        }
-    }
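Note: add_interval() above splits each inclusive code-point interval at the byte boundary: values below cSizeOfByte go into the 256-entry byte-transition table, the remainder into the interval tree. A standalone sketch of the split, using a hypothetical helper (not code from this patch):

    #include <algorithm>
    #include <cstdint>
    #include <utility>

    constexpr uint32_t cSizeOfByte = 256;  // mirrors the constant from Constants.hpp
    using Interval = std::pair<uint32_t, uint32_t>;  // inclusive [first, second]

    // Returns the byte-table part and the tree part of the given interval; a
    // part is empty when its first bound exceeds its second.
    std::pair<Interval, Interval> split_at_byte_boundary(Interval interval) {
        Interval byte_part{interval.first, std::min(interval.second, cSizeOfByte - 1)};
        Interval tree_part{std::max(interval.first, cSizeOfByte), interval.second};
        return {byte_part, tree_part};
    }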
-
-    template <typename NFAStateType>
-    void RegexNFA<NFAStateType>::reverse () {
-        // add new end with all accepting pointing to it
-        NFAStateType* new_end = new_state();
-        for (unique_ptr<NFAStateType>& state_ptr: m_states) {
-            if (state_ptr->is_accepting()) {
-                state_ptr->add_epsilon_transition(new_end);
-                state_ptr->set_accepting(false);
-            }
-        }
-        // move edges from NFA to maps
-        map<pair<NFAStateType*, NFAStateType*>, vector<uint8_t>> byte_edges;
-        map<pair<NFAStateType*, NFAStateType*>, bool> epsilon_edges;
-        for (unique_ptr<NFAStateType>& src_state_ptr: m_states) {
-            // TODO: handle utf8 case with if constexpr (RegexNFAUTF8State == NFAStateType) ~ don't really need this though
-            for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
-                for (NFAStateType* dest_state_ptr: src_state_ptr->get_byte_transitions(byte)) {
-                    byte_edges[pair<NFAStateType*, NFAStateType*>(src_state_ptr.get(), dest_state_ptr)].push_back(byte);
-                }
-                src_state_ptr->clear_byte_transitions(byte);
-            }
-            for (NFAStateType* dest_state_ptr: src_state_ptr->get_epsilon_transitions()) {
-                epsilon_edges[pair<NFAStateType*, NFAStateType*>(src_state_ptr.get(), dest_state_ptr)] = true;
-            }
-            src_state_ptr->clear_epsilon_transitions();
-        }
-
-        // insert edges from maps back into NFA, but in the reverse direction
-        for (unique_ptr<NFAStateType>& src_state_ptr: m_states) {
-            for (unique_ptr<NFAStateType>& dest_state_ptr: m_states) {
-                pair<NFAStateType*, NFAStateType*> key(src_state_ptr.get(), dest_state_ptr.get());
-                auto byte_it = byte_edges.find(key);
-                if (byte_it != byte_edges.end()) {
-                    for (uint8_t byte: byte_it->second) {
-                        dest_state_ptr->add_byte_transition(byte, src_state_ptr.get());
-                    }
-                }
-                auto epsilon_it = epsilon_edges.find(key);
-                if (epsilon_it != epsilon_edges.end()) {
-                    dest_state_ptr->add_epsilon_transition(src_state_ptr.get());
-                }
-            }
-        }
-
-        // propagate tag from old accepting m_states
-        for (NFAStateType* old_accepting_state: new_end->get_epsilon_transitions()) {
-            int tag = old_accepting_state->get_tag();
-            stack<NFAStateType*> unvisited_states;
-            std::set<NFAStateType*> visited_states;
-            unvisited_states.push(old_accepting_state);
-            while (!unvisited_states.empty()) {
-                NFAStateType* current_state = unvisited_states.top();
-                current_state->set_tag(tag);
-                unvisited_states.pop();
-                visited_states.insert(current_state);
-                for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
-                    std::vector<NFAStateType*> byte_transitions = current_state->get_byte_transitions(byte);
-                    for (NFAStateType* next_state: byte_transitions) {
-                        if (visited_states.find(next_state) == visited_states.end()) {
-                            unvisited_states.push(next_state);
-                        }
-                    }
-                }
-                for (NFAStateType* next_state: current_state->get_epsilon_transitions()) {
-                    if (visited_states.find(next_state) == visited_states.end()) {
-                        unvisited_states.push(next_state);
-                    }
-                }
-            }
-        }
-        for (int32_t i = m_states.size() - 1; i >= 0; i--) {
-            unique_ptr<NFAStateType>& src_state_unique_ptr = m_states[i];
-            NFAStateType* src_state = src_state_unique_ptr.get();
-            int tag = src_state->get_tag();
-            for (uint32_t byte = 0; byte < cSizeOfByte; byte++) {
-                std::vector<NFAStateType*> byte_transitions = src_state->get_byte_transitions(byte);
-                for (int32_t j = byte_transitions.size() - 1; j >= 0; j--) {
-                    NFAStateType*& dest_state = byte_transitions[j];
-                    if (dest_state == m_root) {
-                        dest_state = new_state();
-                        assert(dest_state != nullptr);
-                        dest_state->set_tag(tag);
-                        dest_state->set_accepting(true);
-                    }
-                }
-                src_state->clear_byte_transitions(byte);
-                src_state->set_byte_transitions(byte, byte_transitions);
-            }
-            std::vector<NFAStateType*> epsilon_transitions = src_state->get_epsilon_transitions();
-            for (int32_t j = epsilon_transitions.size() - 1; j >= 0; j--) {
-                NFAStateType*& dest_state = epsilon_transitions[j];
-                if (dest_state == m_root) {
-                    dest_state = new_state();
-                    dest_state->set_tag(src_state->get_tag());
-                    dest_state->set_accepting(true);
-                }
-            }
-            src_state->clear_epsilon_transitions();
-            src_state->set_epsilon_transitions(epsilon_transitions);
-        }
-
-        for (uint32_t i = 0; i < m_states.size(); i++) {
-            if (m_states[i].get() == m_root) {
-                m_states.erase(m_states.begin() + i);
-                break;
-            }
-        }
-        // start from the end
-        m_root = new_end;
-
-    }
-
-    template <typename NFAStateType>
-    RegexNFA<NFAStateType>::RegexNFA () {
-        m_root = new_state();
-    }
-
-    template <typename NFAStateType>
-    NFAStateType* RegexNFA<NFAStateType>::new_state () {
-        unique_ptr<NFAStateType> ptr = std::make_unique<NFAStateType>();
-        NFAStateType* state = ptr.get();
-        m_states.push_back(std::move(ptr));
-        return state;
-    }
-}
-
-#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_REGEX_NFA_TPP
\ No newline at end of file
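Note: RegexNFA::reverse() above is, at its core, edge reversal, plus swapping the roles of the start and accepting states and re-propagating tags. The edge-flipping step on a plain adjacency-list digraph, as a self-contained illustration (not code from this patch):

    #include <cstddef>
    #include <vector>

    // Reverse every edge of a digraph: edge u -> v becomes v -> u. reverse()
    // applies the same idea to both byte and epsilon transitions.
    std::vector<std::vector<std::size_t>> reverse_edges(std::vector<std::vector<std::size_t>> const& adj) {
        std::vector<std::vector<std::size_t>> reversed(adj.size());
        for (std::size_t u = 0; u < adj.size(); ++u) {
            for (std::size_t v : adj[u]) {
                reversed[v].push_back(u);
            }
        }
        return reversed;
    }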
diff --git a/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp b/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
deleted file mode 100644
index 016b564da..000000000
--- a/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.hpp
+++ /dev/null
@@ -1,186 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_HPP
-#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_HPP
-
-#include
-#include
-#include
-#include
-#include
-
-// Project headers
-#include "../Constants.hpp"
-
-namespace compressor_frontend::finite_automata {
-
-    template <typename T>
-    class UnicodeIntervalTree {
-    public:
-        /// TODO: probably use this Data type more often in this class???
-        /**
-         * Structure to represent utf8 data
-         */
-        struct Data {
-        public:
-            Data (Interval interval, T value) : m_interval(std::move(interval)), m_value(value) {}
-
-            Interval m_interval;
-            T m_value;
-        };
-
-        /**
-         * Insert data into the tree
-         * @param interval
-         * @param value
-         */
-        void insert (Interval interval, T value);
-
-        /**
-         * Returns all utf8 in the tree
-         * @return std::vector<Data>
-         */
-        std::vector<Data> all () const;
-
-        /**
-         * Return an interval in the tree
-         * @param interval
-         * @return std::unique_ptr<std::vector<Data>>
-         */
-        std::unique_ptr<std::vector<Data>> find (Interval interval);
-
-        /**
-         * Remove an interval from the tree
-         * @param interval
-         * @return std::unique_ptr<std::vector<Data>>
-         */
-        std::unique_ptr<std::vector<Data>> pop (Interval interval);
-
-        void reset () {
-            m_root.reset();
-        }
-
-    private:
-        class Node {
-        public:
-            // Constructor
-            Node () : m_lower(0), m_upper(0), m_height(0) {}
-
-            // Constructor
-            Node (Interval i, T v) : m_interval(std::move(i)), m_value(v) {}
-
-            /**
-             * Balance the subtree below a node
-             * @param node
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> balance (std::unique_ptr<Node> node);
-
-            /**
-             * Insert a node
-             * @param node
-             * @param interval
-             * @param value
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> insert (std::unique_ptr<Node> node, Interval interval, T value);
-
-            /**
-             * Remove a node
-             * @param node
-             * @param interval
-             * @param ret
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> pop (std::unique_ptr<Node> node, Interval interval, std::unique_ptr<Node>* ret);
-
-            /**
-             * Remove a node
-             * @param node
-             * @param ret
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> pop_min (std::unique_ptr<Node> node, std::unique_ptr<Node>* ret);
-
-            /**
-             * Rotate a node by a factor
-             * @param node
-             * @param factor
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> rotate (std::unique_ptr<Node> node, int factor);
-
-            /**
-             * Rotate a node clockwise
-             * @param node
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> rotate_cw (std::unique_ptr<Node> node);
-
-            /**
-             * Rotate a node counterclockwise
-             * @param node
-             * @return std::unique_ptr<Node>
-             */
-            static std::unique_ptr<Node> rotate_ccw (std::unique_ptr<Node> node);
-
-            /**
-             * add all utf8 in subtree to results
-             * @param results
-             */
-            void all (std::vector<Data>* results);
-
-            /**
-             * add all utf8 in subtree that matches interval to results
-             * @param interval
-             * @param results
-             */
-            void find (Interval interval, std::vector<Data>* results);
-
-            /**
-             * update node
-             */
-            void update ();
-
-            /**
-             * get balance factor of node
-             */
-            int balance_factor ();
-
-            /**
-             * overlaps_recursive()
-             * @param i
-             */
-            bool overlaps_recursive (Interval i);
-
-            /**
-             * overlaps()
-             * @param i
-             */
-            bool overlaps (Interval i);
-
-            Interval get_interval () {
-                return m_interval;
-            }
-
-            T get_value () {
-                return m_value;
-            }
-
-        private:
-
-            Interval m_interval;
-            T m_value;
-            uint32_t m_lower{};
-            uint32_t m_upper{};
-            int m_height{};
-            std::unique_ptr<Node> m_left;
-            std::unique_ptr<Node> m_right;
-        };
-
-        std::unique_ptr<Node> m_root;
-    };
-}
-
-// Implementation of template class must be included in anything wanting to use it
-#include "UnicodeIntervalTree.inc"
-
-#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_HPP
diff --git a/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc b/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
deleted file mode 100644
index 2bde708b7..000000000
--- a/components/core/src/compressor_frontend/finite_automata/UnicodeIntervalTree.inc
+++ /dev/null
@@ -1,231 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP
-#define COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP
-
-#include "UnicodeIntervalTree.hpp"
-
-// C++ standard libraries
-#include
-
-using std::max;
-using std::unique_ptr;
-using std::vector;
-
-namespace compressor_frontend::finite_automata {
-
-    template <typename T>
-    void UnicodeIntervalTree<T>::insert (Interval interval, T value) {
-        m_root = Node::insert(std::move(m_root), interval, value);
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::insert (unique_ptr<Node> node, Interval interval, T value) {
-        if (node == nullptr) {
-            unique_ptr<Node> n(new Node(interval, value));
-            n->update();
-            return n;
-        }
-        if (interval < node->m_interval) {
-            node->m_left = Node::insert(std::move(node->m_left), interval, value);
-        } else if (interval > node->m_interval) {
-            node->m_right = Node::insert(std::move(node->m_right), interval, value);
-        } else {
-            node->m_value = value;
-        }
-        node->update();
-        return Node::balance(std::move(node));
-    }
-
-    template <typename T>
-    vector<typename UnicodeIntervalTree<T>::Data> UnicodeIntervalTree<T>::all () const {
-        vector<Data> results;
-        if (m_root != nullptr) {
-            m_root->all(&results);
-        }
-        return results;
-    }
-
-    template <typename T>
-    void UnicodeIntervalTree<T>::Node::all (vector<Data>* results) {
-        if (m_left != nullptr) {
-            m_left->all(results);
-        }
-        results->push_back(Data(m_interval, m_value));
-        if (m_right != nullptr) {
-            m_right->all(results);
-        }
-    }
-
-    template <typename T>
-    unique_ptr<vector<typename UnicodeIntervalTree<T>::Data>> UnicodeIntervalTree<T>::find (Interval interval) {
-        unique_ptr<vector<Data>> results(new vector<Data>);
-        m_root->find(interval, results.get());
-        return results;
-    }
-
-    template <typename T>
-    void UnicodeIntervalTree<T>::Node::find (Interval interval, vector<Data>* results) {
-        if (!overlaps_recursive(interval)) {
-            return;
-        }
-        if (m_left != nullptr) {
-            m_left->find(interval, results);
-        }
-        if (overlaps(interval)) {
-            results->push_back(Data(m_interval, m_value));
-        }
-        if (m_right != nullptr) {
-            m_right->find(interval, results);
-        }
-    }
-
-    template <typename T>
-    unique_ptr<vector<typename UnicodeIntervalTree<T>::Data>> UnicodeIntervalTree<T>::pop (Interval interval) {
-        unique_ptr<vector<Data>> results(new vector<Data>);
-        while (true) {
-            unique_ptr<Node> n;
-            m_root = Node::pop(std::move(m_root), interval, &n);
-            if (n == nullptr) {
-                break;
-            }
-            results->push_back(Data(n->get_interval(), n->get_value()));
-        }
-        return results;
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::pop (unique_ptr<Node> node, Interval interval,
-                                                                                          unique_ptr<Node>* ret) {
-        if (node == nullptr) {
-            return nullptr;
-        }
-        if (!node->overlaps_recursive(interval)) {
-            return node;
-        }
-        node->m_left = Node::pop(std::move(node->m_left), interval, ret);
-        if (ret->get() != nullptr) {
-            node->update();
-            return Node::balance(std::move(node));
-        }
-        assert(node->overlaps(interval));
-        ret->reset(node.release());
-        if (((*ret)->m_left == nullptr) && ((*ret)->m_right == nullptr)) {
-            return nullptr;
-        } else if ((*ret)->m_left == nullptr) {
-            return std::move((*ret)->m_right);
-        } else if ((*ret)->m_right == nullptr) {
-            return std::move((*ret)->m_left);
-        } else {
-            unique_ptr<Node> replacement;
-            unique_ptr<Node> sub_tree = Node::pop_min(std::move((*ret)->m_right), &replacement);
-            replacement->m_left = std::move((*ret)->m_left);
-            replacement->m_right = std::move(sub_tree);
-            replacement->update();
-            return Node::balance(std::move(replacement));
-        }
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::pop_min (unique_ptr<Node> node, unique_ptr<Node>* ret) {
-        assert(node != nullptr);
-        if (node->m_left == nullptr) {
-            assert(node->m_right != nullptr);
-            unique_ptr<Node> right(std::move(node->m_right));
-            ret->reset(node.release());
-            return right;
-        }
-        node->m_left = Node::pop_min(std::move(node->m_left), ret);
-        node->update();
-        return Node::balance(std::move(node));
-    }
-
-    template <typename T>
-    void UnicodeIntervalTree<T>::Node::update () {
-        if ((m_left == nullptr) && (m_right == nullptr)) {
-            m_height = 1;
-            m_lower = m_interval.first;
-            m_upper = m_interval.second;
-        } else if (m_left == nullptr) {
-            m_height = 2;
-            m_lower = m_interval.first;
-            m_upper = max(m_interval.second, m_right->m_upper);
-        } else if (m_right == nullptr) {
-            m_height = 2;
-            m_lower = m_left->m_lower;
-            m_upper = max(m_interval.second, m_left->m_upper);
-        } else {
-            m_height = max(m_left->m_height, m_right->m_height) + 1;
-            m_lower = m_left->m_lower;
-            m_upper = max({m_interval.second, m_left->m_upper, m_right->m_upper});
-        }
-    }
-
-    template <typename T>
-    int UnicodeIntervalTree<T>::Node::balance_factor () {
-        return (m_right != nullptr ? m_right->m_height : 0) -
-               (m_left != nullptr ? m_left->m_height : 0);
-    }
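Note: Node::update() above maintains three augmentations per node: the AVL height plus the subtree's bounding range [m_lower, m_upper], which overlaps_recursive() uses to prune searches. An equivalent standalone recomputation, with hypothetical types (not code from this patch):

    #include <algorithm>
    #include <cstdint>

    struct Aug { uint32_t lower; uint32_t upper; int height; };

    // Recompute a node's augmentations from its own interval [lo, hi] and its
    // optional children, generalizing the four cases in update().
    Aug recompute(uint32_t lo, uint32_t hi, Aug const* left, Aug const* right) {
        Aug a{lo, hi, 1};
        if (left != nullptr) {
            a.lower = left->lower;  // the leftmost descendant holds the smallest bound
            a.upper = std::max(a.upper, left->upper);
            a.height = std::max(a.height, left->height + 1);
        }
        if (right != nullptr) {
            a.upper = std::max(a.upper, right->upper);
            a.height = std::max(a.height, right->height + 1);
        }
        return a;
    }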
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::balance (unique_ptr<Node> node) {
-        int factor = node->balance_factor();
-        if (factor * factor <= 1) {
-            return node;
-        }
-        int sub_factor = (factor < 0) ? node->m_left->balance_factor() : node->m_right->balance_factor();
-        if (factor * sub_factor > 0) {
-            return Node::rotate(std::move(node), factor);
-        }
-        if (factor == 2) {
-            node->m_right = Node::rotate(std::move(node->m_right), sub_factor);
-        } else {
-            node->m_left = Node::rotate(std::move(node->m_left), sub_factor);
-        }
-        return Node::rotate(std::move(node), factor);
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::rotate (unique_ptr<Node> node, int factor) {
-        if (factor < 0) {
-            return Node::rotate_cw(std::move(node));
-        } else if (factor > 0) {
-            return Node::rotate_ccw(std::move(node));
-        }
-        return node;
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::rotate_cw (unique_ptr<Node> node) {
-        unique_ptr<Node> n(std::move(node->m_left));
-        node->m_left.reset(n->m_right.release());
-        n->m_right.reset(node.release());
-        n->m_right->update();
-        n->update();
-        return n;
-    }
-
-    template <typename T>
-    unique_ptr<typename UnicodeIntervalTree<T>::Node> UnicodeIntervalTree<T>::Node::rotate_ccw (unique_ptr<Node> node) {
-        unique_ptr<Node> n(std::move(node->m_right));
-        node->m_right.reset(n->m_left.release());
-        n->m_left.reset(node.release());
-        n->m_left->update();
-        n->update();
-        return n;
-    }
-
-    template <typename T>
-    bool UnicodeIntervalTree<T>::Node::overlaps_recursive (Interval i) {
-        return ((m_lower <= i.first) && (i.first <= m_upper)) ||
-               ((m_lower <= i.second) && (i.second <= m_upper)) ||
-               ((i.first <= m_lower) && (m_lower <= i.second));
-    }
-
-    template <typename T>
-    bool UnicodeIntervalTree<T>::Node::overlaps (Interval i) {
-        return ((m_interval.first <= i.first) && (i.first <= m_interval.second)) ||
-               ((m_interval.first <= i.second) && (i.second <= m_interval.second)) ||
-               ((i.first <= m_interval.first) && (m_interval.first <= i.second));
-    }
-}
-
-#endif // COMPRESSOR_FRONTEND_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP
\ No newline at end of file
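Note: the three disjuncts in Node::overlaps() above are equivalent to the usual compact test for closed intervals, shown here as a self-contained sketch (not code from this patch):

    #include <cstdint>
    #include <utility>

    using Interval = std::pair<uint32_t, uint32_t>;  // inclusive [first, second]

    // Two closed intervals overlap iff each starts no later than the other ends.
    bool overlaps(Interval const& a, Interval const& b) {
        return a.first <= b.second && b.first <= a.second;
    }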
diff --git a/components/core/src/compressor_frontend/utils.cpp b/components/core/src/compressor_frontend/utils.cpp
deleted file mode 100644
index 9efbeb133..000000000
--- a/components/core/src/compressor_frontend/utils.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-#include "utils.hpp"
-
-// C++ standard libraries
-#include
-
-// Project headers
-#include "../FileReader.hpp"
-#include "Constants.hpp"
-#include "LALR1Parser.hpp"
-#include "SchemaParser.hpp"
-
-using std::unique_ptr;
-
-namespace compressor_frontend {
-    void load_lexer_from_file (const std::string& schema_file_path, bool reverse, lexers::ByteLexer& lexer) {
-        FileReader schema_reader;
-        schema_reader.try_open(schema_file_path);
-
-        SchemaParser sp;
-        unique_ptr<SchemaFileAST> schema_ast = sp.generate_schema_ast(schema_reader);
-        auto* delimiters_ptr = dynamic_cast<DelimiterStringAST*>(schema_ast->m_delimiters.get());
-
-        if (!lexer.m_symbol_id.empty()) {
-            throw std::runtime_error("Error: symbol_ids initialized before setting enum symbol_ids");
-        }
-
-        /// TODO: this is a copy of other code
-        lexer.m_symbol_id[cTokenEnd] = (int) SymbolID::TokenEndID;
-        lexer.m_symbol_id[cTokenUncaughtString] = (int) SymbolID::TokenUncaughtStringID;
-        lexer.m_symbol_id[cTokenInt] = (int) SymbolID::TokenIntId;
-        lexer.m_symbol_id[cTokenFloat] = (int) SymbolID::TokenFloatId;
-        lexer.m_symbol_id[cTokenFirstTimestamp] = (int) SymbolID::TokenFirstTimestampId;
-        lexer.m_symbol_id[cTokenNewlineTimestamp] = (int) SymbolID::TokenNewlineTimestampId;
-        lexer.m_symbol_id[cTokenNewline] = (int) SymbolID::TokenNewlineId;
-
-        lexer.m_id_symbol[(int) SymbolID::TokenEndID] = cTokenEnd;
-        lexer.m_id_symbol[(int) SymbolID::TokenUncaughtStringID] = cTokenUncaughtString;
-        lexer.m_id_symbol[(int) SymbolID::TokenIntId] = cTokenInt;
-        lexer.m_id_symbol[(int) SymbolID::TokenFloatId] = cTokenFloat;
-        lexer.m_id_symbol[(int) SymbolID::TokenFirstTimestampId] = cTokenFirstTimestamp;
-        lexer.m_id_symbol[(int) SymbolID::TokenNewlineTimestampId] = cTokenNewlineTimestamp;
-        lexer.m_id_symbol[(int) SymbolID::TokenNewlineId] = cTokenNewline;
-
-        /// TODO: figure out why this needs to be specially added
-        lexer.add_rule(lexer.m_symbol_id["newLine"],
-                       std::move(make_unique<RegexASTLiteral<RegexNFAByteState>>(RegexASTLiteral<RegexNFAByteState>('\n'))));
-
-        if (delimiters_ptr != nullptr) {
-            lexer.add_delimiters(delimiters_ptr->m_delimiters);
-        }
-        for (unique_ptr<ParserAST> const& parser_ast: schema_ast->m_schema_vars) {
-            auto* rule = dynamic_cast<SchemaVarAST*>(parser_ast.get());
-
-            if ("timestamp" == rule->m_name) {
-                continue;
-            }
-
-            if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) {
-                lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size();
-                lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name;
-            }
-
-            // transform '.' from any-character into any non-delimiter character
-            rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters_ptr->m_delimiters);
-
-            /// TODO: this error function is a copy
-            // currently, error out if non-timestamp pattern contains a delimiter
-            // check if regex contains a delimiter
-            bool is_possible_input[cUnicodeMax] = {false};
-            rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
-            bool contains_delimiter = false;
-            uint32_t delimiter_name;
-            for (uint32_t delimiter: delimiters_ptr->m_delimiters) {
-                if (is_possible_input[delimiter]) {
-                    contains_delimiter = true;
-                    delimiter_name = delimiter;
-                    break;
-                }
-            }
-            if (contains_delimiter) {
-                FileReader schema_reader;
-                ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path);
-                if (ErrorCode_Success != error_code) {
-                    throw std::runtime_error(schema_file_path + ":" + to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name
-                                             + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n");
-                } else {
-                    // more detailed debugging based on looking at the file
-                    string line;
-                    for (uint32_t i = 0; i <= rule->m_line_num; i++) {
-                        schema_reader.read_to_delimiter('\n', false, false, line);
-                    }
-                    int colon_pos = 0;
-                    for (char i : line) {
-                        colon_pos++;
-                        if (i == ':') {
-                            break;
-                        }
-                    }
-                    string indent(10, ' ');
-                    string spaces(colon_pos, ' ');
-                    string arrows(line.size() - colon_pos, '^');
-
-                    throw std::runtime_error(schema_file_path + ":" + to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name
-                                             + "' has regex pattern which contains delimiter '" + char(delimiter_name) + "'.\n"
-                                             + indent + line + "\n" + indent + spaces + arrows + "\n");
-
-                }
-            }
-
-            lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
-        }
-        if (reverse) {
-            lexer.generate_reverse();
-        } else {
-            lexer.generate();
-        }
-
-        schema_reader.close();
-    }
-}
diff --git a/components/core/src/compressor_frontend/utils.hpp b/components/core/src/compressor_frontend/utils.hpp
deleted file mode 100644
index 0943d3dda..000000000
--- a/components/core/src/compressor_frontend/utils.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef COMPRESSOR_FRONTEND_UTILS_HPP
-#define COMPRESSOR_FRONTEND_UTILS_HPP
-
-// Project headers
-#include "Lexer.hpp"
-
-namespace compressor_frontend {
-
-    using finite_automata::RegexNFAByteState;
-    using finite_automata::RegexDFAByteState;
-
-    /**
-     * Loads the lexer from the schema file at the given path
-     * @param schema_file_path
-     * @param reverse Whether to generate a reverse lexer
-     * @param lexer
-     */
-    void load_lexer_from_file (const std::string& schema_file_path, bool reverse, Lexer<RegexNFAByteState, RegexDFAByteState>& lexer);
-}
-
-#endif //COMPRESSOR_FRONTEND_UTILS_HPP
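Note: load_lexer_from_file() above keeps m_symbol_id (name -> id) and m_id_symbol (id -> name) mutually inverse, assigning each new rule the next free id, i.e. the current map size. A minimal standalone equivalent, with a hypothetical helper (not code from this patch):

    #include <cassert>
    #include <map>
    #include <string>

    void add_symbol(std::map<std::string, int>& symbol_id,
                    std::map<int, std::string>& id_symbol, std::string const& name) {
        if (symbol_id.count(name) == 0) {
            int const id = static_cast<int>(symbol_id.size());  // next free id
            symbol_id[name] = id;
            id_symbol[id] = name;
        }
        assert(id_symbol.at(symbol_id.at(name)) == name);  // maps stay inverse
    }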
diff --git a/components/core/src/streaming_archive/writer/Archive.cpp b/components/core/src/streaming_archive/writer/Archive.cpp index 415d599e4..b8b900dca 100644 --- a/components/core/src/streaming_archive/writer/Archive.cpp +++ b/components/core/src/streaming_archive/writer/Archive.cpp @@ -18,13 +18,18 @@ // json #include +// Log surgeon +#include +#include + // Project headers -#include "../../compressor_frontend/LogParser.hpp" +#include "../../clp/utils.hpp" #include "../../EncodedVariableInterpreter.hpp" #include "../../spdlog_with_specializations.hpp" #include "../../Utils.hpp" #include "../Constants.hpp" +using log_surgeon::LogEventView; using std::list; using std::make_unique; using std::string; @@ -262,66 +267,74 @@ namespace streaming_archive::writer { update_segment_indices(logtype_id, var_ids); } - void Archive::write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, const bool has_delimiter, - const bool has_timestamp) { + void Archive::write_msg_using_schema (LogEventView const& log_view) { epochtime_t timestamp = 0; TimestampPattern* timestamp_pattern = nullptr; - if (has_timestamp) { + auto const& log_output_buffer = log_view.get_log_output_buffer(); + if (log_output_buffer->has_timestamp()) { size_t start; size_t end; - timestamp_pattern = (TimestampPattern*) TimestampPattern::search_known_ts_patterns( - uncompressed_msg[0].get_string(), timestamp, start, end); - if (old_ts_pattern != *timestamp_pattern) { + timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns( + log_output_buffer->get_mutable_token(0).to_string(), + timestamp, + start, + end + ); + if (m_old_ts_pattern != timestamp_pattern) { change_ts_pattern(timestamp_pattern); - old_ts_pattern = *timestamp_pattern; + m_old_ts_pattern = timestamp_pattern; } - assert(nullptr != timestamp_pattern); } if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { clp::split_file_and_archive(m_archive_user_config, m_path_for_compression, m_group_id, timestamp_pattern, *this); } else if (m_file->get_encoded_size_in_bytes() >= m_target_encoded_file_size) { clp::split_file(m_path_for_compression, m_group_id, timestamp_pattern, *this); } - m_encoded_vars.clear(); m_var_ids.clear(); m_logtype_dict_entry.clear(); - size_t num_uncompressed_bytes = 0; // Timestamp is included in the uncompressed message size - uint32_t start_pos = uncompressed_msg[0].m_start_pos; + uint32_t start_pos = log_output_buffer->get_token(0).m_start_pos; if (timestamp_pattern == nullptr) { - start_pos = uncompressed_msg[1].m_start_pos; + start_pos = log_output_buffer->get_token(1).m_start_pos; } - uint32_t end_pos = uncompressed_msg[uncompressed_msg_pos - 1].m_end_pos; + uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos; if (start_pos <= end_pos) { num_uncompressed_bytes = end_pos - start_pos; } else { - num_uncompressed_bytes = *uncompressed_msg[0].m_buffer_size_ptr - start_pos + end_pos; - } - for (uint32_t i = 1; i < uncompressed_msg_pos; i++) { - compressor_frontend::Token& token = uncompressed_msg[i]; - int token_type = token.m_type_ids->at(0); - if (has_delimiter && token_type != (int) compressor_frontend::SymbolID::TokenUncaughtStringID && - token_type != (int) compressor_frontend::SymbolID::TokenNewlineId) { + num_uncompressed_bytes + = log_output_buffer->get_token(0).m_buffer_size - start_pos +
end_pos; + } + for (uint32_t i = 1; i < log_output_buffer->pos(); i++) { + log_surgeon::Token& token = log_output_buffer->get_mutable_token(i); + int token_type = token.m_type_ids_ptr->at(0); + if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1) + && token_type != static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID) + && token_type != static_cast<int>(log_surgeon::SymbolID::TokenNewlineId)) + { m_logtype_dict_entry.add_constant(token.get_delimiter(), 0, 1); - if (token.m_start_pos == *token.m_buffer_size_ptr - 1) { + if (token.m_start_pos == token.m_buffer_size - 1) { token.m_start_pos = 0; } else { token.m_start_pos++; } } switch (token_type) { - case (int) compressor_frontend::SymbolID::TokenNewlineId: - case (int) compressor_frontend::SymbolID::TokenUncaughtStringID: { - m_logtype_dict_entry.add_constant(token.get_string(), 0, token.get_length()); + case static_cast<int>(log_surgeon::SymbolID::TokenNewlineId): + case static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID): { + m_logtype_dict_entry.add_constant(token.to_string(), 0, token.get_length()); break; } - case (int) compressor_frontend::SymbolID::TokenIntId: { + case static_cast<int>(log_surgeon::SymbolID::TokenIntId): { encoded_variable_t encoded_var; - if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(token.get_string(), encoded_var)) { + if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( + token.to_string(), + encoded_var + )) + { variable_dictionary_id_t id; - m_var_dict.add_entry(token.get_string(), id); + m_var_dict.add_entry(token.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); } else { @@ -330,12 +343,15 @@ namespace streaming_archive::writer { m_encoded_vars.push_back(encoded_var); break; } - case (int) compressor_frontend::SymbolID::TokenFloatId: { + case static_cast<int>(log_surgeon::SymbolID::TokenFloatId): { encoded_variable_t encoded_var; if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( - token.get_string(), encoded_var)) { + token.to_string(), + encoded_var + )) + { variable_dictionary_id_t id; - m_var_dict.add_entry(token.get_string(), id); + m_var_dict.add_entry(token.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); } else { @@ -348,7 +364,7 @@ namespace streaming_archive::writer { // Variable string looks like a dictionary variable, so encode it as so encoded_variable_t encoded_var; variable_dictionary_id_t id; - m_var_dict.add_entry(token.get_string(), id); + m_var_dict.add_entry(token.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_var_ids.push_back(id); diff --git a/components/core/src/streaming_archive/writer/Archive.hpp b/components/core/src/streaming_archive/writer/Archive.hpp index 64569a9f6..048081603 100644 --- a/components/core/src/streaming_archive/writer/Archive.hpp +++ b/components/core/src/streaming_archive/writer/Archive.hpp @@ -14,9 +14,12 @@ #include #include +// Log Surgeon +#include +#include + // Project headers #include "../../ArrayBackedPosIntSet.hpp" -#include "../../compressor_frontend/Token.hpp" #include "../../ErrorCode.hpp" #include "../../GlobalMetadataDB.hpp" #include "../../ir/LogEvent.hpp" @@ -62,8 +65,7 @@ namespace streaming_archive { namespace writer { } }; - TimestampPattern old_ts_pattern; - + TimestampPattern* m_old_ts_pattern; size_t m_target_data_size_of_dicts; UserConfig
m_archive_user_config; std::string m_path_for_compression; @@ -73,7 +75,7 @@ namespace streaming_archive { namespace writer { // Constructors Archive () : m_segments_dir_fd(-1), m_compression_level(0), m_global_metadata_db(nullptr), - old_ts_pattern(), m_schema_file_path() {} + m_old_ts_pattern(nullptr), m_schema_file_path() {} // Destructor ~Archive (); @@ -130,16 +132,13 @@ namespace streaming_archive { namespace writer { * @throw FileWriter::OperationFailed if any write fails */ void write_msg (epochtime_t timestamp, const std::string& message, size_t num_uncompressed_bytes); + /** * Encodes and writes a message to the given file using schema file - * @param file - * @param uncompressed_msg - * @param uncompressed_msg_pos - * @param has_delimiter - * @param has_timestamp + * @param log_event_view * @throw FileWriter::OperationFailed if any write fails */ - void write_msg_using_schema (compressor_frontend::Token*& uncompressed_msg, uint32_t uncompressed_msg_pos, bool has_delimiter, bool has_timestamp); + void write_msg_using_schema (log_surgeon::LogEventView const& log_event_view); /** * Writes an IR log event to the current encoded file diff --git a/components/core/submodules/log-surgeon b/components/core/submodules/log-surgeon new file mode 160000 index 000000000..895f46489 --- /dev/null +++ b/components/core/submodules/log-surgeon @@ -0,0 +1 @@ +Subproject commit 895f46489b1911ab3b3aac3202afd56c96e8cd98 diff --git a/components/core/tests/test-Grep.cpp b/components/core/tests/test-Grep.cpp index 783f5c4bd..96a855c82 100644 --- a/components/core/tests/test-Grep.cpp +++ b/components/core/tests/test-Grep.cpp @@ -4,25 +4,26 @@ // Catch2 #include +// Log Surgeon +#include +#include + // Project headers -#include "../src/compressor_frontend/Lexer.hpp" -#include "../src/compressor_frontend/SchemaParser.hpp" -#include "../src/compressor_frontend/utils.hpp" #include "../src/Grep.hpp" -using compressor_frontend::DelimiterStringAST; -using compressor_frontend::lexers::ByteLexer; -using compressor_frontend::ParserAST; -using compressor_frontend::SchemaFileAST; -using compressor_frontend::SchemaParser; -using compressor_frontend::SchemaVarAST; +using log_surgeon::DelimiterStringAST; +using log_surgeon::lexers::ByteLexer; +using log_surgeon::ParserAST; +using log_surgeon::SchemaAST; +using log_surgeon::SchemaParser; +using log_surgeon::SchemaVarAST; using std::string; TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { ByteLexer forward_lexer; - compressor_frontend::load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, forward_lexer); + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", false, forward_lexer); ByteLexer reverse_lexer; - compressor_frontend::load_lexer_from_file("../tests/test_schema_files/search_schema.txt", true, reverse_lexer); + load_lexer_from_file("../tests/test_schema_files/search_schema.txt", true, reverse_lexer); string str; size_t begin_pos; diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index ef11d30f5..b96fda3c4 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -1,4 +1,8 @@ +// TODO: move this test to log_surgeon +// TODO: move load_lexer_from_file into SearchParser in log_surgeon + // C libraries +#include #include // Boost libraries @@ -8,34 +12,34 @@ // Catch2 #include +// Log Surgeon +#include + // Project headers #include "../src/clp/run.hpp" 
-#include "../src/compressor_frontend/utils.hpp" -#include "../src/compressor_frontend/LogParser.hpp" #include "../src/GlobalMySQLMetadataDB.hpp" - -using compressor_frontend::DelimiterStringAST; -using compressor_frontend::LALR1Parser; -using compressor_frontend::lexers::ByteLexer; -using compressor_frontend::LogParser; -using compressor_frontend::ParserAST; -using compressor_frontend::SchemaFileAST; -using compressor_frontend::SchemaParser; -using compressor_frontend::SchemaVarAST; -using compressor_frontend::Token; - -std::unique_ptr<SchemaFileAST> generate_schema_ast(const std::string& schema_file) { +#include "../src/LogSurgeonReader.hpp" +#include "../src/Utils.hpp" + +using log_surgeon::DelimiterStringAST; +using log_surgeon::LALR1Parser; +using log_surgeon::lexers::ByteLexer; +using log_surgeon::LogParser; +using log_surgeon::ParserAST; +using log_surgeon::SchemaAST; +using log_surgeon::SchemaParser; +using log_surgeon::SchemaVarAST; +using log_surgeon::Token; + +std::unique_ptr<SchemaAST> generate_schema_ast(const std::string& schema_file) { SchemaParser schema_parser; - FileReader schema_file_reader; - schema_file_reader.open(schema_file); - REQUIRE(schema_file_reader.is_open()); - std::unique_ptr<SchemaFileAST> schema_ast = schema_parser.generate_schema_ast(schema_file_reader); + std::unique_ptr<SchemaAST> schema_ast = SchemaParser::try_schema_file(schema_file); REQUIRE(schema_ast.get() != nullptr); return schema_ast; } std::unique_ptr<LogParser> generate_log_parser(const std::string& schema_file) { - std::unique_ptr<SchemaFileAST> schema_ast = generate_schema_ast(schema_file); + std::unique_ptr<SchemaAST> schema_ast = generate_schema_ast(schema_file); std::unique_ptr<LogParser> log_parser = std::make_unique<LogParser>(schema_file); REQUIRE(log_parser.get() != nullptr); return log_parser; @@ -44,7 +48,7 @@ std::unique_ptr<LogParser> generate_log_parser(const std::string& schema_file) { void compress(const std::string& output_dir, const std::string& file_to_compress, std::string schema_file, bool old = false) { std::vector<std::string> arguments; if(old) { - arguments = {"main.cpp", "c", output_dir, file_to_compress}; + arguments = {"main.cpp", "c", output_dir, file_to_compress}; } else { arguments = {"main.cpp", "c", output_dir, file_to_compress, "--schema-path", std::move(schema_file)}; } @@ -68,32 +72,41 @@ void decompress(std::string archive_dir, std::string output_dir) { TEST_CASE("Test error for missing schema file", "[LALR1Parser][SchemaParser]") { std::string file_path = "../tests/test_schema_files/missing_schema.txt"; std::string file_name = boost::filesystem::weakly_canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_schema_ast(file_path), "File not found: " + file_name + "\n"); - SPDLOG_INFO("File not found: " + file_name + "\n"); + REQUIRE_THROWS_WITH( + generate_schema_ast(file_path), + "Failed to read '" + file_path + "', error_code=" + + std::to_string(static_cast<int>(log_surgeon::ErrorCode::FileNotFound)) ); } TEST_CASE("Test error for empty schema file", "[LALR1Parser][SchemaParser]") { std::string file_path = "../tests/test_schema_files/empty_schema.txt"; - std::string file_name = boost::filesystem::canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_schema_ast(file_path), file_name +":1:1: error: empty file\n" - +" \n" - +"^\n"); + REQUIRE_THROWS_WITH( + generate_schema_ast(file_path), + "Schema:1:1: error: empty file\n" + " \n" + "^\n" + ); } TEST_CASE("Test error for colon missing schema file", "[LALR1Parser][SchemaParser]") { std::string file_path = "../tests/test_schema_files/colon_missing_schema.txt"; std::string file_name =
boost::filesystem::canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_schema_ast(file_path), file_name +":3:4: error: expected ':','AlphaNumeric' before ' ' token\n" - +" int [0-9]+\n" - +" ^\n"); + REQUIRE_THROWS_WITH( + generate_schema_ast(file_path), + "Schema:3:4: error: expected ':','AlphaNumeric' before ' ' token\n" + " int [0-9]+\n" + " ^\n" + ); } TEST_CASE("Test error for multi-character tokens in schema file", "[LALR1Parser][SchemaParser]") { std::string file_path = "../tests/test_schema_files/schema_with_multicharacter_token_error.txt"; - std::string file_name = boost::filesystem::canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_schema_ast(file_path), file_name +":2:11: error: expected ':' before ' ' token\n" - +" delimiters : \\r\\n\n" - +" ^\n"); + REQUIRE_THROWS_WITH( + generate_schema_ast(file_path), + "Schema:2:11: error: expected ':' before ' ' token\n" + " delimiters : \\r\\n\n" + " ^\n" + ); } TEST_CASE("Test creating schema parser", "[LALR1Parser][SchemaParser]") { @@ -109,19 +122,26 @@ TEST_CASE("Test creating log parser without delimiters", "[LALR1Parser][LogParse "When using --schema-path, \"delimiters:\" line must be used."); } -TEST_CASE("Test error for creating log file with delimiter in regex pattern", "[LALR1Parser][SchemaParser]") { - std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt"; - std::string file_name = boost::filesystem::canonical(file_path).string(); - REQUIRE_THROWS_WITH(generate_log_parser(file_path), file_name + ":2: error: 'equals' has regex pattern which contains delimiter '='.\n" - + " equals:.*=.*\n" - + " ^^^^^\n"); -} +// TODO: This test doesn't currently work because delimiters are allowed in +// schema files, and there is no option to disable this yet +//TEST_CASE("Test error for creating log file with delimiter in regex pattern", +// "[LALR1Parser]SchemaParser]") { +// std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt"; +// std::string file_name = boost::filesystem::canonical(file_path).string(); +// REQUIRE_THROWS_WITH(generate_log_parser(file_path), +// file_name + +// ":2: error: 'equals' has regex pattern which contains delimiter '='.\n" +// + " equals:.*=.*\n" +// + " ^^^^^\n"); +//} -/// TODO: This error check is performed correctly by CLP, but it is handled by something different now so this test will fail as is +// TODO: This error check is performed correctly by CLP, but it is handled by +// something different now so this test will fail as is //TEST_CASE("Test error for missing log file", "[LALR1Parser][LogParser]") { // std::string file_name = "../tests/test_log_files/missing_log.txt"; // std::string file_path = boost::filesystem::weakly_canonical(file_name).string(); -// REQUIRE_THROWS(compress("../tests/test_archives", file_name, "../tests/test_schema_files/schema_that_does_not_exist.txt"), +// REQUIRE_THROWS(compress("../tests/test_archives", file_name, +// "../tests/test_schema_files/schema_that_does_not_exist.txt"), // "Specified schema file does not exist."); //} @@ -129,15 +149,21 @@ TEST_CASE("Test forward lexer", "[Search]") { ByteLexer forward_lexer; std::string schema_file_name = "../tests/test_schema_files/search_schema.txt"; std::string schema_file_path = boost::filesystem::weakly_canonical(schema_file_name).string(); - compressor_frontend::load_lexer_from_file(schema_file_path, false, forward_lexer); - FileReader reader; - reader.open("../tests/test_search_queries/easy.txt"); - forward_lexer.reset(reader); 
- Token token = forward_lexer.scan(); - while (token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) { - SPDLOG_INFO("token:" + token.get_string() + "\n"); - SPDLOG_INFO("token.m_type_ids->back():" + forward_lexer.m_id_symbol[token.m_type_ids->back()] + "\n"); - token = forward_lexer.scan(); + load_lexer_from_file(schema_file_path, false, forward_lexer); + FileReader file_reader; + LogSurgeonReader reader_wrapper(file_reader); + file_reader.open("../tests/test_search_queries/easy.txt"); + log_surgeon::ParserInputBuffer parser_input_buffer; + parser_input_buffer.read_if_safe(reader_wrapper); + forward_lexer.reset(); + Token token; + auto error_code = forward_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); + while (token.m_type_ids_ptr->at(0) != static_cast<int>(log_surgeon::SymbolID::TokenEndID)) { + SPDLOG_INFO("token:" + token.to_string() + "\n"); + SPDLOG_INFO("token.m_type_ids->back():" + forward_lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n"); + error_code = forward_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); } } @@ -145,14 +171,20 @@ TEST_CASE("Test reverse lexer", "[Search]") { ByteLexer reverse_lexer; std::string schema_file_name = "../tests/test_schema_files/search_schema.txt"; std::string schema_file_path = boost::filesystem::weakly_canonical(schema_file_name).string(); - compressor_frontend::load_lexer_from_file(schema_file_path, true, reverse_lexer); - FileReader reader; - reader.open("../tests/test_search_queries/easy.txt"); - reverse_lexer.reset(reader); - Token token = reverse_lexer.scan(); - while (token.m_type_ids->at(0) != (int)compressor_frontend::SymbolID::TokenEndID) { - SPDLOG_INFO("token:" + token.get_string() + "\n"); - SPDLOG_INFO("token.m_type_ids->back():" + reverse_lexer.m_id_symbol[token.m_type_ids->back()] + "\n"); - token = reverse_lexer.scan(); + load_lexer_from_file(schema_file_path, false, reverse_lexer); + FileReader file_reader; + LogSurgeonReader reader_wrapper(file_reader); + file_reader.open("../tests/test_search_queries/easy.txt"); + log_surgeon::ParserInputBuffer parser_input_buffer; + parser_input_buffer.read_if_safe(reader_wrapper); + reverse_lexer.reset(); + Token token; + auto error_code = reverse_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); + while (token.m_type_ids_ptr->at(0) != static_cast<int>(log_surgeon::SymbolID::TokenEndID)) { + SPDLOG_INFO("token:" + token.to_string() + "\n"); + SPDLOG_INFO("token.m_type_ids->back():" + reverse_lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n"); + error_code = reverse_lexer.scan(parser_input_buffer, token); + REQUIRE(error_code == log_surgeon::ErrorCode::Success); } } diff --git a/components/core/tests/test_log_files/log.txt b/components/core/tests/test_log_files/log.txt index 51309fc85..185e4723d 100644 --- a/components/core/tests/test_log_files/log.txt +++ b/components/core/tests/test_log_files/log.txt @@ -1,6 +1,7 @@ 2016-05-08 07:34:05.251 MyDog123 APet4123\test.txt 2016-05-08 07:34:05.252 statictext123 -2016-05-08 07:34:05.253 123 +2016-05-08 07:34:05.253 123 1.9 GB out of 4.2 GB data 2016-05-08 07:34:05.254 123.123 +is multiline 2016-05-08 07:34:05.255 Some Static Text Then MyDog123 APet4123\test.txt Then 123 then 123.123 -123123 relative timestamp \ No newline at end of file +123123 relative timestamp diff --git a/components/core/tests/test_schema_files/colon_missing_schema.txt
b/components/core/tests/test_schema_files/colon_missing_schema.txt index 0e063a696..d2c25cfbf 100644 --- a/components/core/tests/test_schema_files/colon_missing_schema.txt +++ b/components/core/tests/test_schema_files/colon_missing_schema.txt @@ -1,3 +1,3 @@ delimiters: -double:[0-9]+\.[0-9]+ +float:[0-9]+\.[0-9]+ int [0-9]+ \ No newline at end of file diff --git a/components/core/tests/test_schema_files/real_schema.txt b/components/core/tests/test_schema_files/real_schema.txt index 4a72dff29..3c2cb6e29 100644 --- a/components/core/tests/test_schema_files/real_schema.txt +++ b/components/core/tests/test_schema_files/real_schema.txt @@ -4,7 +4,7 @@ delimiters: \r\n // First set of variables timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}[,\.][0-9]{0,3} int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Second set of variables hex:[a-fA-F]+ diff --git a/components/core/tests/test_schema_files/schema_with_delimiter_in_regex_error.txt b/components/core/tests/test_schema_files/schema_with_delimiter_in_regex_error.txt index 9bd2488c2..7491d1580 100644 --- a/components/core/tests/test_schema_files/schema_with_delimiter_in_regex_error.txt +++ b/components/core/tests/test_schema_files/schema_with_delimiter_in_regex_error.txt @@ -4,4 +4,4 @@ identifier:(My.og)\d{3}APet[0-9]*\\test\.txt timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3} serverName:(S|s)erver[0-9]* int:[0-9]+ -double:[0-9]+\.[0-9]+ \ No newline at end of file +float:[0-9]+\.[0-9]+ \ No newline at end of file diff --git a/components/core/tests/test_schema_files/schema_with_delimiters.txt b/components/core/tests/test_schema_files/schema_with_delimiters.txt index 0b0f9af9f..532dba9de 100644 --- a/components/core/tests/test_schema_files/schema_with_delimiters.txt +++ b/components/core/tests/test_schema_files/schema_with_delimiters.txt @@ -3,4 +3,4 @@ identifier:(My.og)\d{3}APet[0-9]*\\test\.txt timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3} serverName:(S|s)erver[0-9]* int:[0-9]+ -double:[0-9]+\.[0-9]+ \ No newline at end of file +float:[0-9]+\.[0-9]+ \ No newline at end of file diff --git a/components/core/tests/test_schema_files/schema_with_multicharacter_token_error.txt b/components/core/tests/test_schema_files/schema_with_multicharacter_token_error.txt index 5fa7f41ea..efe3fff1a 100644 --- a/components/core/tests/test_schema_files/schema_with_multicharacter_token_error.txt +++ b/components/core/tests/test_schema_files/schema_with_multicharacter_token_error.txt @@ -4,7 +4,7 @@ delimiters : \r\n // First set of variables timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3} int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Second set of variables hex:[a-fA-F]+ diff --git a/components/core/tests/test_schema_files/schema_without_delimiters.txt b/components/core/tests/test_schema_files/schema_without_delimiters.txt index 7b25296d4..ea28b6142 100644 --- a/components/core/tests/test_schema_files/schema_without_delimiters.txt +++ b/components/core/tests/test_schema_files/schema_without_delimiters.txt @@ -2,4 +2,4 @@ identifier:(My.og)\d{3}\sAPet[0-9]*\\test\.txt timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3} serverName:(S|s)erver[0-9]* int:[0-9]+ -double:[0-9]+\.[0-9]+ \ No newline at end of file +float:[0-9]+\.[0-9]+ \ No newline at end of file diff --git a/components/core/tests/test_schema_files/search_schema.txt 
b/components/core/tests/test_schema_files/search_schema.txt index 73f11db6b..f49a6dbfa 100644 --- a/components/core/tests/test_schema_files/search_schema.txt +++ b/components/core/tests/test_schema_files/search_schema.txt @@ -4,7 +4,7 @@ delimiters: \r\n:,=!;%? // First set of variables timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]{3}){0,1} int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Second set of variables hex:[a-fA-F]+ diff --git a/components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile b/components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile index d93d575a8..fea78e668 100644 --- a/components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile +++ b/components/core/tools/docker-images/clp-env-base-centos7.4/Dockerfile @@ -13,8 +13,8 @@ RUN ./tools/scripts/lib_install/centos7.4/install-all.sh # Set PKG_CONFIG_PATH since CentOS doesn't look in /usr/local by default ENV PKG_CONFIG_PATH /usr/local/lib64/pkgconfig:/usr/local/lib/pkgconfig -# Enable gcc 9 in login shells and non-interactive non-login shells -RUN ln -s /opt/rh/devtoolset-9/enable /etc/profile.d/devtoolset.sh +# Enable gcc 10 in login shells and non-interactive non-login shells +RUN ln -s /opt/rh/devtoolset-10/enable /etc/profile.d/devtoolset.sh # Enable git 2.27 # NOTE: We use a script to enable the SCL git package on each git call because some Github actions diff --git a/components/core/tools/docker-images/clp-env-base-ubuntu-focal/Dockerfile b/components/core/tools/docker-images/clp-env-base-ubuntu-focal/Dockerfile index 794ad77c9..60c307818 100644 --- a/components/core/tools/docker-images/clp-env-base-ubuntu-focal/Dockerfile +++ b/components/core/tools/docker-images/clp-env-base-ubuntu-focal/Dockerfile @@ -7,6 +7,12 @@ ADD ./tools/scripts/lib_install ./tools/scripts/lib_install RUN ./tools/scripts/lib_install/ubuntu-focal/install-all.sh +# Set the compiler to gcc-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 +RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 +RUN update-alternatives --set gcc /usr/bin/gcc-10 +RUN update-alternatives --set g++ /usr/bin/g++-10 + # Reset the working directory so that it's accessible by any user who runs the # container WORKDIR / diff --git a/components/core/tools/scripts/lib_install/centos7.4/README.md b/components/core/tools/scripts/lib_install/centos7.4/README.md index 0662e53aa..d529c0d03 100644 --- a/components/core/tools/scripts/lib_install/centos7.4/README.md +++ b/components/core/tools/scripts/lib_install/centos7.4/README.md @@ -17,10 +17,10 @@ will not install any dependencies you don't expect. # Setup dependencies -* Enable gcc 9 +* Enable gcc 10 ```bash - ln -s /opt/rh/devtoolset-9/enable /etc/profile.d/devtoolset.sh + ln -s /opt/rh/devtoolset-10/enable /etc/profile.d/devtoolset.sh ``` * Set PKG_CONFIG_PATH since CentOS doesn't look in `/usr/local` by default. 
diff --git a/components/core/tools/scripts/lib_install/centos7.4/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/centos7.4/install-packages-from-source.sh index 2c911912d..daeef06be 100755 --- a/components/core/tools/scripts/lib_install/centos7.4/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/centos7.4/install-packages-from-source.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -# Enable gcc 9 -source /opt/rh/devtoolset-9/enable +# Enable gcc 10 +source /opt/rh/devtoolset-10/enable # NOTE: cmake and boost must be installed first since the remaining packages depend on them ./tools/scripts/lib_install/install-cmake.sh 3.21.2 diff --git a/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh index aab2e8168..e9398083b 100755 --- a/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos7.4/install-prebuilt-packages.sh @@ -14,5 +14,5 @@ yum install -y \ # Install packages from CentOS' software collections repository (centos-release-scl) yum install -y \ - devtoolset-9 \ + devtoolset-10 \ rh-git227 diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index 67e165d76..4ee5a0359 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -8,6 +8,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y \ curl \ build-essential \ git \ + g++-10 \ + gcc-10 \ libboost-filesystem-dev \ libboost-iostreams-dev \ libboost-program-options-dev \ diff --git a/components/package-template/src/etc/clp-schema.template.txt b/components/package-template/src/etc/clp-schema.template.txt index d1d480308..f026b5612 100644 --- a/components/package-template/src/etc/clp-schema.template.txt +++ b/components/package-template/src/etc/clp-schema.template.txt @@ -49,7 +49,7 @@ timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}.\d{6} // Specially-encoded variables (using the `int` and `double` keywords) int:\-{0,1}[0-9]+ -double:\-{0,1}[0-9]+\.[0-9]+ +float:\-{0,1}[0-9]+\.[0-9]+ // Dictionary variables hex:[a-fA-F]+
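Note: taken together, the test changes above define the new scanning pattern against log-surgeon: wrap CLP's FileReader in a LogSurgeonReader, fill a ParserInputBuffer, then scan token by token until TokenEndID. A sketch assembled only from calls appearing in this patch (include paths illustrative; error handling elided):

    #include <string>

    // Illustrative include paths; the exact log-surgeon headers are elided above.
    #include "../src/FileReader.hpp"
    #include "../src/LogSurgeonReader.hpp"
    #include "../src/Utils.hpp"

    using log_surgeon::Token;
    using log_surgeon::lexers::ByteLexer;

    void scan_file(std::string const& schema_path, std::string const& input_path) {
        ByteLexer lexer;
        load_lexer_from_file(schema_path, false, lexer);

        FileReader file_reader;
        LogSurgeonReader reader_wrapper(file_reader);
        file_reader.open(input_path);

        log_surgeon::ParserInputBuffer parser_input_buffer;
        parser_input_buffer.read_if_safe(reader_wrapper);
        lexer.reset();

        Token token;
        auto error_code = lexer.scan(parser_input_buffer, token);
        while (log_surgeon::ErrorCode::Success == error_code
               && token.m_type_ids_ptr->at(0)
                          != static_cast<int>(log_surgeon::SymbolID::TokenEndID)) {
            // token.to_string() yields the matched text
            error_code = lexer.scan(parser_input_buffer, token);
        }
    }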