diff --git a/.clang-format b/.clang-format index e3906589..b1424521 100644 --- a/.clang-format +++ b/.clang-format @@ -1,26 +1,157 @@ --- -BasedOnStyle: LLVM ColumnLimit: 100 IndentWidth: 4 --- Language: Cpp AccessModifierOffset: -4 +AlignAfterOpenBracket: BlockIndent +AlignArrayOfStructures: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignConsecutiveMacros: None +AlignEscapedNewlines: DontAlign +AlignOperands: Align +AlignTrailingComments: Never +AllowAllArgumentsOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: Always +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: MultiLine + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterExternBlock: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyNamespace: false + SplitEmptyRecord: false +BreakAfterAttributes: Never +BreakBeforeBinaryOperators: All +BreakBeforeBraces: Custom +BreakBeforeConceptDeclarations: Always +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +CompactNamespaces: true +ConstructorInitializerIndentWidth: 8 ContinuationIndentWidth: 8 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: + # NOTE: A header is grouped by first matching regex + # Project headers + - Regex: '^$' + Priority: 1 + # C++ standard libraries + - Regex: '^<.*>$' + Priority: 2 +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: Indent +IndentGotoLabels: false +IndentPPDirectives: BeforeHash +IndentRequiresClause: false +IndentWrappedFunctionNames: false +InsertBraces: true +InsertNewlineAtEOF: true +IntegerLiteralSeparator: + Binary: 4 + BinaryMinDigits: 4 + Decimal: 3 + DecimalMinDigits: 5 + Hex: 4 + HexMinDigits: 4 +KeepEmptyLinesAtTheStartOfBlocks: false +LambdaBodyIndentation: Signature +LineEnding: LF +MaxEmptyLinesToKeep: 1 NamespaceIndentation: Inner +PPIndentWidth: -1 PackConstructorInitializers: CurrentLine +PenaltyBreakAssignment: 50 +PenaltyBreakOpenParenthesis: 25 +PenaltyBreakBeforeFirstCallParameter: 25 +PenaltyReturnTypeOnItsOwnLine: 100 PointerAlignment: Left -QualifierAlignment: Right -ReflowComments: false +QualifierAlignment: Custom +QualifierOrder: + - static + - friend + - inline + # constexpr west as explained in https://www.youtube.com/watch?v=z6s6bacI424 + - constexpr + - type + - const + - volatile +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveSemicolon: true +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Always +ShortNamespaceLines: 0 +SortIncludes: CaseInsensitive +SortUsingDeclarations: Lexicographic +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true -SpaceBeforeParens: Custom -SpaceBeforeParensOptions: - AfterControlStatements: true -# AfterFunctionDeclarationName: true -# AfterFunctionDefinitionName: true +SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false Standard: Latest +TabWidth: 4 +UseTab: Never diff --git a/.clang-tidy b/.clang-tidy index 989dd557..ce1bdf96 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,9 +1,24 @@ --- -Checks: 'cert-*,clang-analyzer-*,clang-diagnostic-*,cppcoreguidelines-*,modernize-*,performance-*,readability-*,-readability-identifier-length,-readability-simplify-boolean-expr' +Checks: >- + bugprone-*, + -bugprone-easily-swappable-parameters, + cert-*, + clang-analyzer-*, + clang-diagnostic-*, + concurrency-*, + cppcoreguidelines-*, + misc-*, + modernize-*, + performance-*, + portability-*, + readability-*, + -readability-identifier-length, + -readability-simplify-boolean-expr, + +WarningsAsErrors: '*' FormatStyle: file -HeaderFileExtensions: ['','h','hh','hpp','hxx','tpp'] -ImplementationFileExtensions: ['','c','cc','cpp','cxx'] CheckOptions: + misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: true readability-identifier-naming.ClassCase: 'CamelCase' readability-identifier-naming.ClassMemberCase: 'lower_case' readability-identifier-naming.ClassMemberPrefix: 'm_' @@ -23,4 +38,5 @@ CheckOptions: readability-identifier-naming.ParameterCase: 'lower_case' readability-identifier-naming.StructCase: 'CamelCase' readability-identifier-naming.TypedefCase: 'CamelCase' + readability-identifier-naming.TypedefIgnoredRegexp: '[a-z_]+_t' readability-identifier-naming.UnionCase: 'CamelCase' diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..8b0ffbb4 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,27 @@ +name: Build + +on: + pull_request: + push: + workflow_call: + +jobs: + build: + strategy: + matrix: + os: [macos-latest, ubuntu-latest] + build_type: [Release, Debug] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + + - if: ${{ 'macos-latest' == matrix.os }} + run: | + brew update + brew install llvm + + - run: cmake -B ./build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + + - run: cmake --build ./build --config ${{ matrix.build_type }} + + - run: cmake --install ./build --prefix ./install diff --git a/src/log_surgeon/Buffer.hpp b/src/log_surgeon/Buffer.hpp index df6916be..c2c032ba 100644 --- a/src/log_surgeon/Buffer.hpp +++ b/src/log_surgeon/Buffer.hpp @@ -1,28 +1,25 @@ #ifndef LOG_SURGEON_BUFFER_HPP #define LOG_SURGEON_BUFFER_HPP -// C++ libraries #include #include -// Project Headers -#include "Constants.hpp" -#include "Reader.hpp" +#include +#include namespace log_surgeon { /** * A base class for the efficient implementation of a single growing buffer. * Under the hood it keeps track of one static buffer and multiple dynamic * buffers. The buffer object uses the underlying static buffer whenever - * possible, as the static buffer is on the stack and results in faster - * reads and writes. In outlier cases, where the static buffer is not large - * enough to fit all the needed data, the buffer object switches to using - * the underlying dynamic buffers. A new dynamic buffer is used each time - * the size must be grown to preserve any pointers to the buffer. All - * pointers to the buffer are valid until reset() is called and the - * buffer returns to using the underlying static buffer. The base class does - * not grow the buffer itself, the child class is responsible for doing - * this. + * possible, as the static buffer is on the stack and results in faster reads + * and writes. In outlier cases, where the static buffer is not large enough to + * fit all the needed data, the buffer object switches to using the underlying + * dynamic buffers. A new dynamic buffer is used each time the size must be + * grown to preserve any pointers to the buffer. All pointers to the buffer are + * valid until reset() is called and the buffer returns to using the underlying + * static buffer. The base class does not grow the buffer itself, the child + * class is responsible for doing this. */ template class Buffer { @@ -94,6 +91,6 @@ class Buffer { Item m_static_storage[cStaticByteBuffSize]; Item* m_active_storage{m_static_storage}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_BUFFER_HPP +#endif // LOG_SURGEON_BUFFER_HPP diff --git a/src/log_surgeon/BufferParser.cpp b/src/log_surgeon/BufferParser.cpp index 9fe650e9..29a06233 100644 --- a/src/log_surgeon/BufferParser.cpp +++ b/src/log_surgeon/BufferParser.cpp @@ -1,26 +1,29 @@ +#include "BufferParser.hpp" + #include -#include "BufferParser.hpp" -#include "Constants.hpp" -#include "LogEvent.hpp" -#include "Schema.hpp" +#include +#include +#include namespace log_surgeon { BufferParser::BufferParser(Schema& schema) : m_log_parser(schema.get_schema_ast_ptr()) {} BufferParser::BufferParser(std::string const& schema_file_path) - : m_log_parser(LogParser(schema_file_path)) {} + : m_log_parser(LogParser(schema_file_path)) {} auto BufferParser::reset() -> void { m_log_parser.reset(); m_done = false; } -auto BufferParser::get_next_event_view(char* buf, - size_t size, - size_t& offset, - LogEventView& event_view, - bool finished_reading_input) -> ErrorCode { +auto BufferParser::get_next_event_view( + char* buf, + size_t size, + size_t& offset, + LogEventView& event_view, + bool finished_reading_input +) -> ErrorCode { event_view.reset(); // TODO in order to allow logs/tokens to wrap user buffers this function // will need more parameters or the input buffer may need to be exposed to @@ -48,17 +51,18 @@ auto BufferParser::get_next_event_view(char* buf, for (uint32_t i = start; i < event_view.m_log_output_buffer->pos(); i++) { Token* token = &event_view.m_log_output_buffer->get_mutable_token(i); event_view.add_token(token->m_type_ids_ptr->at(0), token); - if (token->m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineId && - first_newline_pos == 0) { + if (token->m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineId && first_newline_pos == 0) + { first_newline_pos = i; } } // To be a multiline log there must be at least one token between the // newline token and the last token in the output buffer. - if (event_view.m_log_output_buffer->has_timestamp() && 0 < first_newline_pos && - first_newline_pos + 1 < event_view.m_log_output_buffer->pos()) { + if (event_view.m_log_output_buffer->has_timestamp() && 0 < first_newline_pos + && first_newline_pos + 1 < event_view.m_log_output_buffer->pos()) + { event_view.set_multiline(true); } return ErrorCode::Success; } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/BufferParser.hpp b/src/log_surgeon/BufferParser.hpp index 96ef2b36..8b4a9eb9 100644 --- a/src/log_surgeon/BufferParser.hpp +++ b/src/log_surgeon/BufferParser.hpp @@ -4,9 +4,9 @@ #include #include -#include "LogEvent.hpp" -#include "LogParser.hpp" -#include "Schema.hpp" +#include +#include +#include namespace log_surgeon { /** @@ -67,11 +67,13 @@ class BufferParser { * internally before this method returns. * @return ErrorCode from LogParser::parse. */ - auto get_next_event_view(char* buf, - size_t size, - size_t& offset, - LogEventView& event_view, - bool finished_reading_input = false) -> ErrorCode; + auto get_next_event_view( + char* buf, + size_t size, + size_t& offset, + LogEventView& event_view, + bool finished_reading_input = false + ) -> ErrorCode; /** * @return The underlying LogParser. @@ -100,6 +102,6 @@ class BufferParser { LogParser m_log_parser; bool m_done{false}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_BUFFER_PARSER_HPP +#endif // LOG_SURGEON_BUFFER_PARSER_HPP diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index f963fd98..5752749a 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -1,15 +1,14 @@ #ifndef LOG_SURGEON_CONSTANTS_HPP #define LOG_SURGEON_CONSTANTS_HPP -// C++ standard libraries #include #include namespace log_surgeon { -constexpr uint32_t cUnicodeMax = 0x10FFFF; +constexpr uint32_t cUnicodeMax = 0x10'FFFF; constexpr uint32_t cSizeOfByte = 256; -constexpr uint32_t cSizeOfAllChildren = 10000; -constexpr uint32_t cNullSymbol = 10000000; +constexpr uint32_t cSizeOfAllChildren = 10'000; +constexpr uint32_t cNullSymbol = 10'000'000; enum class ErrorCode { Success, @@ -42,14 +41,14 @@ constexpr char cTokenHex[] = "hex"; constexpr char cTokenFirstTimestamp[] = "firstTimestamp"; constexpr char cTokenNewlineTimestamp[] = "newLineTimestamp"; constexpr char cTokenNewline[] = "newLine"; -constexpr uint32_t cStaticByteBuffSize = 48000; +constexpr uint32_t cStaticByteBuffSize = 48'000; namespace utf8 { - // 0xC0, 0xC1, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF are invalid - // UTF-8 code units + // 0xC0, 0xC1, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, + // 0xFF are invalid UTF-8 code units static unsigned char const cCharEOF = 0xFF; static unsigned char const cCharErr = 0xFE; -} // namespace utf8 -} // namespace log_surgeon +} // namespace utf8 +} // namespace log_surgeon -#endif // LOG_SURGEON_CONSTANTS_HPP +#endif // LOG_SURGEON_CONSTANTS_HPP diff --git a/src/log_surgeon/FileReader.cpp b/src/log_surgeon/FileReader.cpp index d0574ad3..edaa2b30 100644 --- a/src/log_surgeon/FileReader.cpp +++ b/src/log_surgeon/FileReader.cpp @@ -1,13 +1,11 @@ #include "FileReader.hpp" -// C standard libraries #include -// C++ libraries #include #include -#include "Constants.hpp" +#include using std::string; @@ -53,8 +51,8 @@ auto FileReader::try_open(string const& path) -> ErrorCode { auto FileReader::close() -> void { if (m_file != nullptr) { - // NOTE: We don't check errors for fclose since it seems the only reason it could fail is if - // it was interrupted by a signal + // NOTE: We don't check errors for fclose since it seems the only reason + // it could fail is if it was interrupted by a signal fclose(m_file); m_file = nullptr; } @@ -81,4 +79,4 @@ auto FileReader::try_read_to_delimiter(char delim, bool keep_delimiter, bool app str.append(m_get_delim_buf, num_bytes_read); return ErrorCode::Success; } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/FileReader.hpp b/src/log_surgeon/FileReader.hpp index 6cce477f..1af906db 100644 --- a/src/log_surgeon/FileReader.hpp +++ b/src/log_surgeon/FileReader.hpp @@ -1,13 +1,11 @@ #ifndef LOG_SURGEON_FILE_READER_HPP #define LOG_SURGEON_FILE_READER_HPP -// C++ libraries #include #include -// Project headers -#include "Constants.hpp" -#include "Reader.hpp" +#include +#include namespace log_surgeon { class FileReader : public Reader { @@ -27,10 +25,13 @@ class FileReader : public Reader { auto read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode; /** - * Tries to read a string from the file until it reaches the specified delimiter + * Tries to read a string from the file until it reaches the specified + * delimiter * @param delim The delimiter to stop at - * @param keep_delimiter Whether to include the delimiter in the output string or not - * @param append Whether to append to the given string or replace its contents + * @param keep_delimiter Whether to include the delimiter in the output + * string or not + * @param append Whether to append to the given string or replace its + * contents * @param str The string read * @return ErrorCode::Success on success * @return ErrorCode::EndOfFile on EOF @@ -58,6 +59,6 @@ class FileReader : public Reader { size_t m_get_delim_buf_len{0}; char* m_get_delim_buf{nullptr}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_FILE_READER_HPP +#endif // LOG_SURGEON_FILE_READER_HPP diff --git a/src/log_surgeon/LALR1Parser.cpp b/src/log_surgeon/LALR1Parser.cpp index a176116f..5268091d 100644 --- a/src/log_surgeon/LALR1Parser.cpp +++ b/src/log_surgeon/LALR1Parser.cpp @@ -8,10 +8,9 @@ ParserAST::~ParserAST() = default; uint32_t NonTerminal::m_next_children_start = 0; NonTerminal::NonTerminal(Production* p) - : m_children_start(NonTerminal::m_next_children_start), - m_production(p), - m_ast(nullptr) { - + : m_children_start(NonTerminal::m_next_children_start), + m_production(p), + m_ast(nullptr) { NonTerminal::m_next_children_start += p->m_body.size(); } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/LALR1Parser.hpp b/src/log_surgeon/LALR1Parser.hpp index ce1aa3d7..63bfed1c 100644 --- a/src/log_surgeon/LALR1Parser.hpp +++ b/src/log_surgeon/LALR1Parser.hpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_LALR1_PARSER_HPP #define LOG_SURGEON_LALR1_PARSER_HPP -// C++ standard libraries #include #include #include @@ -17,10 +16,9 @@ #include #include -// Project headers -#include "Constants.hpp" -#include "Lexer.hpp" -#include "Parser.hpp" +#include +#include +#include namespace log_surgeon { @@ -39,7 +37,6 @@ using Action = std::variant; class ParserAST { public: - // Constructor virtual ~ParserAST() = 0; template @@ -61,21 +58,20 @@ using MatchedSymbol = std::variant; template struct Overloaded : Ts... { using Ts::operator()...; -}; // (1) +}; template -Overloaded(Ts...) -> Overloaded; // (2) +Overloaded(Ts...) -> Overloaded; class NonTerminal { public: - // Constructor NonTerminal() : m_children_start(0), m_production(nullptr), m_ast(nullptr) {} - // Constructor explicit NonTerminal(Production*); /** * Return the ith child's (body of production) MatchedSymbol as a Token. - * Note: only children are needed (and stored) for performing semantic actions (for the AST) + * Note: only children are needed (and stored) for performing semantic + * actions (for the AST) * @param i * @return Token* */ @@ -85,8 +81,9 @@ class NonTerminal { } /** - * Return the ith child's (body of production) MatchedSymbol as a NonTerminal. - * Note: only children are needed (and stored) for performing semantic actions (for the AST) + * Return the ith child's (body of production) MatchedSymbol as a + * NonTerminal. Note: only children are needed (and stored) for performing + * semantic actions (for the AST) * @param i * @return NonTerminal* */ @@ -96,8 +93,8 @@ class NonTerminal { } /** - * Return the AST that relates this non_terminal's children together (based on the - * production/syntax-rule that was determined to have generated them) + * Return the AST that relates this non_terminal's children together (based + * on the production/syntax-rule that was determined to have generated them) * @return std::unique_ptr */ auto get_parser_ast() -> std::unique_ptr& { return m_ast; } @@ -111,15 +108,16 @@ class NonTerminal { /** * Structure representing a production of the form "m_head -> {m_body}". - * The code fragment to execute upon reducing "{m_body} -> m_head" is m_semantic_rule, which is - * purely a function of the MatchedSymbols for {m_body}. m_index is the productions position in the - * parsers production vector. + * The code fragment to execute upon reducing "{m_body} -> m_head" is + * m_semantic_rule, which is purely a function of the MatchedSymbols for + * {m_body}. m_index is the productions position in the parsers production + * vector. */ struct Production { public: /** - * Returns if the production is an epsilon production. An epsilon production has nothing on its - * LHS (i.e., HEAD -> {}) + * Returns if the production is an epsilon production. An epsilon production + * has nothing on its LHS (i.e., HEAD -> {}) * @return bool */ [[nodiscard]] auto is_epsilon() const -> bool { return this->m_body.empty(); } @@ -132,16 +130,15 @@ struct Production { /** * Structure representing an item in a LALR1 state. - * An item (1) is associated with a m_production and a single m_lookahead which is an input symbol - * (character) that can follow the m_production, and (2) tracks the current matching progress of its - * associated m_production, where everything exclusively to the left of m_dot is already matched. + * An item (1) is associated with a m_production and a single m_lookahead which + * is an input symbol (character) that can follow the m_production, and (2) + * tracks the current matching progress of its associated m_production, where + * everything exclusively to the left of m_dot is already matched. */ struct Item { public: - // Constructor Item() = default; - // Constructor Item(Production* p, uint32_t d, uint32_t t) : m_production(p), m_dot(d), m_lookahead(t) {} /** @@ -151,13 +148,13 @@ struct Item { * @return bool */ friend auto operator<(Item const& lhs, Item const& rhs) -> bool { - return std::tie(lhs.m_production->m_index, lhs.m_dot, lhs.m_lookahead) < - std::tie(rhs.m_production->m_index, rhs.m_dot, rhs.m_lookahead); + return std::tie(lhs.m_production->m_index, lhs.m_dot, lhs.m_lookahead) + < std::tie(rhs.m_production->m_index, rhs.m_dot, rhs.m_lookahead); } /** - * Returns if the item has a dot at the end. This indicates the production associated with the - * item has already been fully matched. + * Returns if the item has a dot at the end. This indicates the production + * associated with the item has already been fully matched. * @return bool */ [[nodiscard]] auto has_dot_at_end() const -> bool { @@ -174,14 +171,15 @@ struct Item { Production* m_production; uint32_t m_dot; - uint32_t m_lookahead; // for LR0 items, `m_lookahead` is unused + uint32_t m_lookahead; // for LR0 items, `m_lookahead` is unused }; /** * Structure representing an LALR1 state, a collection of items. - * The m_kernel is sufficient for fully representing the state, but m_closure is useful for - * computations. m_next indicates what state (ItemSet) to transition to based on the symbol received - * from the lexer m_actions is the action to perform based on the symbol received from the lexer. + * The m_kernel is sufficient for fully representing the state, but m_closure is + * useful for computations. m_next indicates what state (ItemSet) to transition + * to based on the symbol received from the lexer m_actions is the action to + * perform based on the symbol received from the lexer. */ struct ItemSet { public: @@ -207,26 +205,26 @@ struct ItemSet { template class LALR1Parser : public Parser { public: - // Constructor LALR1Parser(); - /// TODO: combine all the add_* into add_rule /** * Add a lexical rule to m_lexer * @param name * @param rule */ - auto add_rule(std::string const& name, - std::unique_ptr> rule) -> void override; + auto + add_rule(std::string const& name, std::unique_ptr> rule) + -> void override; /** * Calls add_rule with the given RegexASTGroup * @param name * @param rule_char */ - auto add_token_group(std::string const& name, - std::unique_ptr> rule_group) - -> void; + auto add_token_group( + std::string const& name, + std::unique_ptr> rule_group + ) -> void; /** * Constructs a RegexASTCat and calls add_rule @@ -242,16 +240,19 @@ class LALR1Parser : public Parser { * @param semantic_rule * @return uint32_t */ - auto add_production(std::string const& head, - std::vector const& body, - SemanticRule semantic_rule) -> uint32_t; + auto add_production( + std::string const& head, + std::vector const& body, + SemanticRule semantic_rule + ) -> uint32_t; /** - * Generate the LALR1 parser (use after all the lexical rules and productions have been added) + * Generate the LALR1 parser (use after all the lexical rules and + * productions have been added) */ auto generate() -> void; - /// TODO: add throws to function headers + // TODO: add throws to function headers /** * Parse an input (e.g. file) * @param reader @@ -261,14 +262,14 @@ class LALR1Parser : public Parser { protected: /** - * Reset the parser to start a new parsing (set state to root, reset buffers, reset vars - * tracking positions) + * Reset the parser to start a new parsing (set state to root, reset + * buffers, reset vars tracking positions) */ auto reset() -> void; /** - * Return an error string based on the current error state, matched_stack, and next_symbol in - * the parser + * Return an error string based on the current error state, matched_stack, + * and next_symbol in the parser * @param reader * @return std::string */ @@ -287,8 +288,6 @@ class LALR1Parser : public Parser { ParserInputBuffer m_input_buffer; private: - // Parser generation - /** * Generate LR0 kernels based on the productions in m_productions */ @@ -310,13 +309,14 @@ class LALR1Parser : public Parser { auto lr_closure_helper(ItemSet* item_set_ptr, Item const* item, uint32_t* next_symbol) -> bool; /** - * Return the next state (ItemSet) based on the current state (ItemSet) and input symbol + * Return the next state (ItemSet) based on the current state (ItemSet) and + * input symbol * @return ItemSet* */ auto go_to(ItemSet* /*from_item_set*/, uint32_t const& /*next_symbol*/) -> ItemSet*; /** - * Generate m_firsts, which specify for each symbol, all possible prefixes (I think?) + * Generate m_firsts, which specify for each symbol, all possible prefixes */ auto generate_first_sets() -> void; @@ -332,35 +332,34 @@ class LALR1Parser : public Parser { auto generate_lr1_closure(ItemSet* item_set_ptr) -> void; /** - * Generating parsing table and goto table for LALR1 parser based on state-symbol pair - * generate_lalr1_goto() + generate_lalr1_action() + * Generating parsing table and goto table for LALR1 parser based on + * state-symbol pair generate_lalr1_goto() + generate_lalr1_action() */ auto generate_lalr1_parsing_table() -> void; /** - * Generating the goto table for LARL1 parser specifying which state (ItemSet) to transition to - * based on state-symbol pair Does nothing (its already done in an earlier step) + * Generating the goto table for LARL1 parser specifying which state + * (ItemSet) to transition to based on state-symbol pair Does nothing (its + * already done in an earlier step) */ auto generate_lalr1_goto() -> void; /** - * Generating the action table for LARL1 parser specifying which action to perform based on - * state-symbol pair + * Generating the action table for LARL1 parser specifying which action to + * perform based on state-symbol pair */ auto generate_lalr1_action() -> void; - // Parser utilization - /** - * Use the previous symbol from the lexer if unused, otherwise request the next symbol from the - * lexer + * Use the previous symbol from the lexer if unused, otherwise request the + * next symbol from the lexer * @return Token */ auto get_next_symbol() -> Token; /** - * Tries all symbols in the language that the next token may be until the first non-error symbol - * is tried + * Tries all symbols in the language that the next token may be until the + * first non-error symbol is tried * @param next_token * @param accept * @return bool @@ -368,8 +367,9 @@ class LALR1Parser : public Parser { auto parse_advance(Token& next_token, bool* accept) -> bool; /** - * Perform an action and state transition based on the current state (ItemSet) and the type_id - * (current symbol interpretation of the next_token) + * Perform an action and state transition based on the current state + * (ItemSet) and the type_id (current symbol interpretation of the + * next_token) * @param type_id * @param next_token * @param accept @@ -377,8 +377,6 @@ class LALR1Parser : public Parser { */ auto parse_symbol(uint32_t const& type_id, Token& next_token, bool* accept) -> bool; - // Error handling - /** * Get the current line up to the error symbol * @param parse_stack_matches @@ -397,7 +395,6 @@ class LALR1Parser : public Parser { auto symbol_is_token(uint32_t s) -> bool { return m_terminals.find(s) != m_terminals.end(); } - // Variables std::set m_terminals; std::set m_nullable; std::map, std::unique_ptr> m_lr0_item_sets; @@ -407,8 +404,8 @@ class LALR1Parser : public Parser { std::map> m_propagate_map; std::unordered_map> m_go_to_table; }; -} // namespace log_surgeon +} // namespace log_surgeon #include "LALR1Parser.tpp" -#endif // LOG_SURGEON_LALR1_PARSER_HPP +#endif // LOG_SURGEON_LALR1_PARSER_HPP diff --git a/src/log_surgeon/LALR1Parser.tpp b/src/log_surgeon/LALR1Parser.tpp index 15d07bae..dcf8df07 100644 --- a/src/log_surgeon/LALR1Parser.tpp +++ b/src/log_surgeon/LALR1Parser.tpp @@ -2,17 +2,57 @@ #define LOG_SURGEON_LALR1_PARSER_TPP #include - -// C++ standard libraries #include #include #include -// Project headers -#include "Constants.hpp" +#include namespace log_surgeon { +namespace { + [[maybe_unused]] auto get_line_num(MatchedSymbol& top_symbol) -> uint32_t { + std::optional line_num{std::nullopt}; + std::stack symbols; + symbols.push(std::move(top_symbol)); + while (std::nullopt == line_num) { + assert(!symbols.empty()); + MatchedSymbol& curr_symbol = symbols.top(); + std::visit( + Overloaded{ + [&line_num](Token& token) { line_num = token.m_line; }, + [&symbols](NonTerminal& m) { + for (size_t i = 0; i < m.m_production->m_body.size(); i++) { + symbols.push(std::move( + NonTerminal::m_all_children[m.m_children_start + i] + )); + } + }}, + curr_symbol + ); + symbols.pop(); + } + return *line_num; + } + + [[maybe_unused]] auto unescape(char const& c) -> std::string { + switch (c) { + case '\t': + return "\\t"; + case '\r': + return "\\r"; + case '\n': + return "\\n"; + case '\v': + return "\\v"; + case '\f': + return "\\f"; + default: + return {c}; + } + } +} // namespace + template LALR1Parser::LALR1Parser() { m_terminals.insert((int)SymbolID::TokenEndID); @@ -27,7 +67,9 @@ LALR1Parser::LALR1Parser() { template void LALR1Parser::add_rule( - std::string const& name, std::unique_ptr> rule) { + std::string const& name, + std::unique_ptr> rule +) { Parser::add_rule(name, std::move(rule)); m_terminals.insert(this->m_lexer.m_symbol_id[name]); } @@ -35,36 +77,44 @@ void LALR1Parser::add_rule( template void LALR1Parser::add_token_group( std::string const& name, - std::unique_ptr> rule_group) { + std::unique_ptr> rule_group +) { add_rule(name, std::move(rule_group)); } template -void LALR1Parser::add_token_chain(std::string const& name, - std::string const& chain) { +void LALR1Parser::add_token_chain( + std::string const& name, + std::string const& chain +) { assert(chain.size() > 1); - std::unique_ptr> first_char_rule = - std::make_unique>(chain[0]); - std::unique_ptr> second_char_rule = - std::make_unique>(chain[1]); - std::unique_ptr> rule_chain = - std::make_unique>( - std::move(first_char_rule), std::move(second_char_rule)); + std::unique_ptr> first_char_rule + = std::make_unique>(chain[0]); + std::unique_ptr> second_char_rule + = std::make_unique>(chain[1]); + std::unique_ptr> rule_chain + = std::make_unique>( + std::move(first_char_rule), + std::move(second_char_rule) + ); for (uint32_t i = 2; i < chain.size(); i++) { char next_char = chain[i]; - std::unique_ptr> next_char_rule = - std::make_unique>(next_char); + std::unique_ptr> next_char_rule + = std::make_unique>(next_char); rule_chain = std::make_unique>( - std::move(rule_chain), std::move(next_char_rule)); + std::move(rule_chain), + std::move(next_char_rule) + ); } add_rule(name, std::move(rule_chain)); } template -auto LALR1Parser::add_production(std::string const& head, - std::vector const& body, - SemanticRule semantic_rule) - -> uint32_t { +auto LALR1Parser::add_production( + std::string const& head, + std::vector const& body, + SemanticRule semantic_rule +) -> uint32_t { if (this->m_lexer.m_symbol_id.find(head) == this->m_lexer.m_symbol_id.end()) { this->m_lexer.m_symbol_id[head] = this->m_lexer.m_symbol_id.size(); this->m_lexer.m_id_symbol[this->m_lexer.m_symbol_id[head]] = head; @@ -141,17 +191,20 @@ void LALR1Parser::generate_lr0_kernels() { } template -auto LALR1Parser::lr_closure_helper(ItemSet* item_set_ptr, - Item const* item, - uint32_t* next_symbol) -> bool { - if (!item_set_ptr->m_closure.insert(*item).second) { // add {S'->(dot)S, ""} +auto LALR1Parser::lr_closure_helper( + ItemSet* item_set_ptr, + Item const* item, + uint32_t* next_symbol +) -> bool { + // add {S'->(dot)S, ""} + if (!item_set_ptr->m_closure.insert(*item).second) { return true; } if (item->has_dot_at_end()) { return true; } *next_symbol = item->next_symbol(); - if (this->symbol_is_token(*next_symbol)) { // false + if (this->symbol_is_token(*next_symbol)) { return true; } return false; @@ -159,10 +212,12 @@ auto LALR1Parser::lr_closure_helper(ItemSet* item_se template void LALR1Parser::generate_lr0_closure(ItemSet* item_set_ptr) { - std::deque q(item_set_ptr->m_kernel.begin(), - item_set_ptr->m_kernel.end()); // {{S'->(dot)S, ""}} + std::deque q( + item_set_ptr->m_kernel.begin(), + item_set_ptr->m_kernel.end() + ); // {{S'->(dot)S, ""}} while (!q.empty()) { - Item item = q.back(); // {S'->(dot)S, ""} + Item item = q.back(); // {S'->(dot)S, ""} q.pop_back(); uint32_t next_symbol = 0; if (lr_closure_helper(item_set_ptr, &item, &next_symbol)) { @@ -171,15 +226,17 @@ void LALR1Parser::generate_lr0_closure(ItemSet* item if (m_non_terminals.find(next_symbol) == m_non_terminals.end()) { assert(false); } - for (Production* const p : m_non_terminals.at(next_symbol)) { // S -> a - q.emplace_back(p, 0, cNullSymbol); // {S -> (dot) a, ""} + for (Production* const p : m_non_terminals.at(next_symbol)) { // S -> a + q.emplace_back(p, 0, cNullSymbol); // {S -> (dot) a, ""} } } } template -auto LALR1Parser::go_to(ItemSet* from_item_set, - uint32_t const& next_symbol) -> ItemSet* { +auto LALR1Parser::go_to( + ItemSet* from_item_set, + uint32_t const& next_symbol +) -> ItemSet* { std::unique_ptr next_item_set_ptr = std::make_unique(); assert(from_item_set != nullptr); for (Item const& item : from_item_set->m_closure) { @@ -187,8 +244,8 @@ auto LALR1Parser::go_to(ItemSet* from_item_set, continue; } if (item.next_symbol() == next_symbol) { - next_item_set_ptr->m_kernel.emplace( - item.m_production, item.m_dot + 1, item.m_lookahead); + next_item_set_ptr->m_kernel + .emplace(item.m_production, item.m_dot + 1, item.m_lookahead); } } if (next_item_set_ptr->m_kernel.empty()) { @@ -242,8 +299,8 @@ void LALR1Parser::generate_first_sets() { template void LALR1Parser::generate_lr1_item_sets() { - for (std::map, std::unique_ptr>::value_type const& kv : - m_lr0_item_sets) { + for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) + { for (Item const& l0_item : kv.second->m_kernel) { ItemSet temp_item_set; temp_item_set.m_kernel.insert(l0_item); @@ -261,11 +318,13 @@ void LALR1Parser::generate_lr1_item_sets() { } } std::map> lookaheads; - for (std::map, std::unique_ptr>::value_type const& kv : - m_lr0_item_sets) { + for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) + { for (Item const& l0_item : kv.second->m_kernel) { - lookaheads[l0_item].insert(m_spontaneous_map[l0_item.m_production].begin(), - m_spontaneous_map[l0_item.m_production].end()); + lookaheads[l0_item].insert( + m_spontaneous_map[l0_item.m_production].begin(), + m_spontaneous_map[l0_item.m_production].end() + ); if (l0_item.m_production == m_productions[m_root_production_id].get()) { lookaheads[l0_item].insert((int)SymbolID::TokenEndID); } @@ -278,23 +337,26 @@ void LALR1Parser::generate_lr1_item_sets() { Item item_from = kv.first; for (Item const& item_to : kv.second) { size_t size_before = lookaheads[item_to].size(); - lookaheads[item_to].insert(lookaheads[item_from].begin(), - lookaheads[item_from].end()); + lookaheads[item_to].insert( + lookaheads[item_from].begin(), + lookaheads[item_from].end() + ); size_t size_after = lookaheads[item_to].size(); changed = changed || size_after > size_before; } } } - for (std::map, std::unique_ptr>::value_type const& kv : - m_lr0_item_sets) { + for (std::map, std::unique_ptr>::value_type const& kv : m_lr0_item_sets) + { std::unique_ptr lr1_item_set_ptr = std::make_unique(); for (Item const& l0_item : kv.second->m_kernel) { for (int const& lookahead : lookaheads[l0_item]) { Item lr1_item(l0_item.m_production, l0_item.m_dot, lookahead); lr1_item_set_ptr->m_kernel.insert(lr1_item); } - if (l0_item.m_production == m_productions[m_root_production_id].get() && - l0_item.m_dot == 0) { + if (l0_item.m_production == m_productions[m_root_production_id].get() + && l0_item.m_dot == 0) + { m_root_item_set_ptr = lr1_item_set_ptr.get(); } } @@ -304,10 +366,12 @@ void LALR1Parser::generate_lr1_item_sets() { } // this seems like the wrong way to do this still: for (std::map, std::unique_ptr>::value_type const& kv1 : - m_lr1_item_sets) { + m_lr1_item_sets) + { for (auto const& next_index : m_go_to_table[kv1.second->m_index]) { for (std::map, std::unique_ptr>::value_type const& kv2 : - m_lr1_item_sets) { + m_lr1_item_sets) + { if (next_index.second == kv2.second->m_index) { kv1.second->m_next[next_index.first] = kv2.second.get(); break; @@ -332,9 +396,11 @@ void LALR1Parser::generate_lr1_closure(ItemSet* item while (pos < item.m_production->m_body.size()) { uint32_t symbol = item.m_production->m_body.at(pos); std::set symbol_firsts = m_firsts.find(symbol)->second; - lookaheads.insert(lookaheads.end(), - std::make_move_iterator(symbol_firsts.begin()), - std::make_move_iterator(symbol_firsts.end())); + lookaheads.insert( + lookaheads.end(), + std::make_move_iterator(symbol_firsts.begin()), + std::make_move_iterator(symbol_firsts.end()) + ); if (m_nullable.find(symbol) == m_nullable.end()) { break; } @@ -365,29 +431,31 @@ void LALR1Parser::generate_lalr1_goto() { // Dragon book page 253 template void LALR1Parser::generate_lalr1_action() { - for (std::map, std::unique_ptr>::value_type const& kv : - m_lr1_item_sets) { + for (std::map, std::unique_ptr>::value_type const& kv : m_lr1_item_sets) + { ItemSet* item_set_ptr = kv.second.get(); item_set_ptr->m_actions.resize(this->m_lexer.m_symbol_id.size(), false); for (Item const& item : item_set_ptr->m_closure) { if (!item.has_dot_at_end()) { - if (m_terminals.find(item.next_symbol()) == m_terminals.end() && - m_non_terminals.find(item.next_symbol()) == m_non_terminals.end()) { + if (m_terminals.find(item.next_symbol()) == m_terminals.end() + && m_non_terminals.find(item.next_symbol()) == m_non_terminals.end()) + { continue; } assert(item_set_ptr->m_next.find(item.next_symbol()) != item_set_ptr->m_next.end()); Action& action = item_set_ptr->m_actions[item.next_symbol()]; if (!std::holds_alternative(action)) { - if (std::holds_alternative(action) && - std::get(action) == item_set_ptr->m_next[item.next_symbol()]) { + if (std::holds_alternative(action) + && std::get(action) == item_set_ptr->m_next[item.next_symbol()]) + { continue; } std::string conflict_msg{}; conflict_msg += "For symbol "; conflict_msg += this->m_lexer.m_id_symbol[item.next_symbol()]; conflict_msg += ", adding shift to "; - conflict_msg += - std::to_string(item_set_ptr->m_next[item.next_symbol()]->m_index); + conflict_msg += std::to_string(item_set_ptr->m_next[item.next_symbol()]->m_index + ); conflict_msg += " causes "; if (std::holds_alternative(action)) { conflict_msg += "shift-shift conflict with shift to "; @@ -395,8 +463,8 @@ void LALR1Parser::generate_lalr1_action() { conflict_msg += "\n"; } else { conflict_msg += "shift-reduce conflict with reduction "; - conflict_msg += - this->m_lexer.m_id_symbol[std::get(action)->m_head]; + conflict_msg + += this->m_lexer.m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; @@ -405,8 +473,8 @@ void LALR1Parser::generate_lalr1_action() { } throw std::runtime_error(conflict_msg); } - item_set_ptr->m_actions[item.next_symbol()] = - item_set_ptr->m_next[item.next_symbol()]; + item_set_ptr->m_actions[item.next_symbol()] + = item_set_ptr->m_next[item.next_symbol()]; } if (item.has_dot_at_end()) { if (item.m_production == m_productions[m_root_production_id].get()) { @@ -431,9 +499,9 @@ void LALR1Parser::generate_lalr1_action() { conflict_msg += "\n"; } else { conflict_msg += "reduce-reduce conflict with reduction "; - conflict_msg += - this->m_lexer - .m_id_symbol[std::get(action)->m_head]; + conflict_msg + += this->m_lexer + .m_id_symbol[std::get(action)->m_head]; conflict_msg += "-> {"; for (uint32_t symbol : std::get(action)->m_body) { conflict_msg += this->m_lexer.m_id_symbol[symbol] + ","; @@ -449,54 +517,39 @@ void LALR1Parser::generate_lalr1_action() { } } -static auto get_line_num(MatchedSymbol& top_symbol) -> uint32_t { - std::optional line_num{std::nullopt}; - std::stack symbols; - symbols.push(std::move(top_symbol)); - while (std::nullopt == line_num) { - assert(!symbols.empty()); - MatchedSymbol& curr_symbol = symbols.top(); - std::visit(Overloaded{[&line_num](Token& token) { line_num = token.m_line; }, - [&symbols](NonTerminal& m) { - for (size_t i = 0; i < m.m_production->m_body.size(); i++) { - symbols.push(std::move( - NonTerminal::m_all_children[m.m_children_start + i])); - } - }}, - curr_symbol); - symbols.pop(); - } - return *line_num; -} - template auto LALR1Parser::get_input_after_last_newline( - std::stack& parse_stack_matches) -> std::string { + std::stack& parse_stack_matches +) -> std::string { std::string error_message_reversed; bool done = false; while (!parse_stack_matches.empty() && !done) { MatchedSymbol top_symbol = std::move(parse_stack_matches.top()); parse_stack_matches.pop(); - std::visit(Overloaded{[&error_message_reversed, &done](Token& token) { - if (token.to_string() == "\r" || token.to_string() == "\n") { - done = true; - } else { - // input is being read backwards, so reverse each token so - // that when the entire input is reversed each token is - // displayed correctly - std::string token_string = token.to_string(); - std::reverse(token_string.begin(), token_string.end()); - error_message_reversed += token_string; - } - }, - [&parse_stack_matches](NonTerminal& m) { - for (size_t i = 0; i < m.m_production->m_body.size(); i++) { - assert(m.m_children_start + i < cSizeOfAllChildren); - parse_stack_matches.push(std::move( - NonTerminal::m_all_children[m.m_children_start + i])); - } - }}, - top_symbol); + std::visit( + Overloaded{ + [&error_message_reversed, &done](Token& token) { + if (token.to_string() == "\r" || token.to_string() == "\n") { + done = true; + } else { + // input is being read backwards, so reverse + // each token so that when the entire input is + // reversed each token is displayed correctly + std::string token_string = token.to_string(); + std::reverse(token_string.begin(), token_string.end()); + error_message_reversed += token_string; + } + }, + [&parse_stack_matches](NonTerminal& m) { + for (size_t i = 0; i < m.m_production->m_body.size(); i++) { + assert(m.m_children_start + i < cSizeOfAllChildren); + parse_stack_matches.push(std::move( + NonTerminal::m_all_children[m.m_children_start + i] + )); + } + }}, + top_symbol + ); } std::reverse(error_message_reversed.begin(), error_message_reversed.end()); return error_message_reversed; @@ -507,12 +560,12 @@ auto LALR1Parser::get_input_until_next_newline(Token -> std::string { std::string rest_of_line; bool next_is_end_token = (error_token->m_type_ids_ptr->at(0) == (int)SymbolID::TokenEndID); - bool next_has_newline = (error_token->to_string().find('\n') != std::string::npos) || - (error_token->to_string().find('\r') != std::string::npos); + bool next_has_newline = (error_token->to_string().find('\n') != std::string::npos) + || (error_token->to_string().find('\r') != std::string::npos); while (!next_has_newline && !next_is_end_token) { Token token = get_next_symbol(); - next_has_newline = (token.to_string().find('\n') != std::string::npos) || - (token.to_string().find('\r') != std::string::npos); + next_has_newline = (token.to_string().find('\n') != std::string::npos) + || (token.to_string().find('\r') != std::string::npos); if (!next_has_newline) { rest_of_line += token.to_string(); next_is_end_token = (token.m_type_ids_ptr->at(0) == (int)SymbolID::TokenEndID); @@ -522,23 +575,6 @@ auto LALR1Parser::get_input_until_next_newline(Token return rest_of_line; } -static auto unescape(char const& c) -> std::string { - switch (c) { - case '\t': - return "\\t"; - case '\r': - return "\\r"; - case '\n': - return "\\n"; - case '\v': - return "\\v"; - case '\f': - return "\\f"; - default: - return {c}; - } -} - template auto LALR1Parser::report_error() -> std::string { assert(m_next_token == std::nullopt); @@ -565,9 +601,11 @@ auto LALR1Parser::report_error() -> std::string { Action action = m_parse_stack_states.top()->m_actions[i]; if (action.index() != 0) { error_type += "'"; - if (auto* regex_ast_literal = - dynamic_cast*>( - this->m_lexer.get_rule(i))) { + if (auto* regex_ast_literal + = dynamic_cast*>( + this->m_lexer.get_rule(i) + )) + { error_type += unescape(char(regex_ast_literal->get_character())); } else { error_type += this->m_lexer.m_id_symbol[i]; @@ -578,9 +616,9 @@ auto LALR1Parser::report_error() -> std::string { error_type.pop_back(); error_type += " before '" + unescape(token.to_string()[0]) + "' token"; } - std::string error_string = "Schema:" + std::to_string(line_num + 1) + ":" + - std::to_string(consumed_input.size() + 1) + - ": error: " + error_type + "\n"; + std::string error_string = "Schema:" + std::to_string(line_num + 1) + ":" + + std::to_string(consumed_input.size() + 1) + + ": error: " + error_type + "\n"; for (int i = 0; i < 10; i++) { error_string += " "; } @@ -628,7 +666,8 @@ auto LALR1Parser::get_next_symbol() -> Token { if (m_next_token == std::nullopt) { Token token; if (ErrorCode error = this->m_lexer.scan(m_input_buffer, token); - ErrorCode::Success != error) { + ErrorCode::Success != error) + { throw std::runtime_error("Error scanning in lexer."); } return token; @@ -653,62 +692,67 @@ auto LALR1Parser::parse_advance(Token& next_token, b } template -auto LALR1Parser::parse_symbol(uint32_t const& type_id, - Token& next_token, - bool* accept) -> bool { +auto LALR1Parser::parse_symbol( + uint32_t const& type_id, + Token& next_token, + bool* accept +) -> bool { ItemSet* curr = m_parse_stack_states.top(); Action& it = curr->m_actions[type_id]; bool ret = false; std::visit( - Overloaded{[&ret, &accept](bool is_accepting) { - if (!is_accepting) { - ret = false; - return; - } - *accept = true; - ret = true; - return; - }, - [&ret, &next_token, this](ItemSet* shift) { - m_parse_stack_states.push(shift); - m_parse_stack_matches.emplace(std::move(next_token)); - ret = true; - return; - }, - [&ret, &next_token, this](Production* reduce) { - m_next_token = next_token; - NonTerminal matched_non_terminal(reduce); - size_t n = reduce->m_body.size(); - for (size_t i = 0; i < n; i++) { - m_parse_stack_states.pop(); - assert((matched_non_terminal.m_children_start + n - i - 1) < - cSizeOfAllChildren); - NonTerminal::m_all_children[matched_non_terminal.m_children_start + - n - i - 1] = - std::move(m_parse_stack_matches.top()); - m_parse_stack_matches.pop(); - } - if (reduce->m_semantic_rule != nullptr) { - if (0 == m_next_token->m_start_pos) { - m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - - 1); - } else { - m_input_buffer.set_consumed_pos(m_next_token->m_start_pos - 1); - } - matched_non_terminal.m_ast = - reduce->m_semantic_rule(&matched_non_terminal); - } - ItemSet* curr = m_parse_stack_states.top(); - Action const& it = - curr->m_actions[matched_non_terminal.m_production->m_head]; - m_parse_stack_states.push(std::get(it)); - m_parse_stack_matches.emplace(std::move(matched_non_terminal)); - ret = true; - return; - }}, - it); + Overloaded{ + [&ret, &accept](bool is_accepting) { + if (!is_accepting) { + ret = false; + return; + } + *accept = true; + ret = true; + return; + }, + [&ret, &next_token, this](ItemSet* shift) { + m_parse_stack_states.push(shift); + m_parse_stack_matches.emplace(std::move(next_token)); + ret = true; + return; + }, + [&ret, &next_token, this](Production* reduce) { + m_next_token = next_token; + NonTerminal matched_non_terminal(reduce); + size_t n = reduce->m_body.size(); + for (size_t i = 0; i < n; i++) { + m_parse_stack_states.pop(); + assert((matched_non_terminal.m_children_start + n - i - 1) + < cSizeOfAllChildren); + NonTerminal::m_all_children + [matched_non_terminal.m_children_start + n - i - 1] + = std::move(m_parse_stack_matches.top()); + m_parse_stack_matches.pop(); + } + if (reduce->m_semantic_rule != nullptr) { + if (0 == m_next_token->m_start_pos) { + m_input_buffer.set_consumed_pos( + m_input_buffer.storage().size() - 1 + ); + } else { + m_input_buffer.set_consumed_pos(m_next_token->m_start_pos - 1); + } + matched_non_terminal.m_ast + = reduce->m_semantic_rule(&matched_non_terminal); + } + ItemSet* curr = m_parse_stack_states.top(); + Action const& it + = curr->m_actions[matched_non_terminal.m_production->m_head]; + m_parse_stack_states.push(std::get(it)); + m_parse_stack_matches.emplace(std::move(matched_non_terminal)); + ret = true; + return; + }}, + it + ); return ret; } -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LALR1_PARSER_TPP +#endif // LOG_SURGEON_LALR1_PARSER_TPP diff --git a/src/log_surgeon/Lexer.hpp b/src/log_surgeon/Lexer.hpp index d8fc6771..6e003c22 100644 --- a/src/log_surgeon/Lexer.hpp +++ b/src/log_surgeon/Lexer.hpp @@ -1,7 +1,7 @@ #ifndef LOG_SURGEON_LEXER_HPP #define LOG_SURGEON_LEXER_HPP -// C++ standard libraries +#include #include #include #include @@ -10,21 +10,20 @@ #include #include -// Project headers -#include "Constants.hpp" -#include "ParserInputBuffer.hpp" -#include "Token.hpp" -#include "finite_automata/RegexAST.hpp" -#include "finite_automata/RegexDFA.hpp" -#include "finite_automata/RegexNFA.hpp" +#include +#include +#include +#include +#include +#include namespace log_surgeon { template class Lexer { public: // std::vector can be declared as constexpr in c++20 - inline static std::vector const cTokenEndTypes = {(int)SymbolID::TokenEndID}; - inline static std::vector const cTokenUncaughtStringTypes = { + static inline std::vector const cTokenEndTypes = {(int)SymbolID::TokenEndID}; + static inline std::vector const cTokenUncaughtStringTypes = { (int)SymbolID::TokenUncaughtStringID}; /** @@ -33,8 +32,8 @@ class Lexer { struct Rule { // Constructor Rule(uint32_t n, std::unique_ptr> r) - : m_name(n), - m_regex(std::move(r)) {} + : m_name(n), + m_regex(std::move(r)) {} /** * Adds AST representing the lexical rule to the NFA @@ -73,12 +72,14 @@ class Lexer { auto generate() -> void; /** - * Generate DFA for a reverse lexer matching the reverse of the words in the original language + * Generate DFA for a reverse lexer matching the reverse of the words in the + * original language */ auto generate_reverse() -> void; /** - * Reset the lexer to start a new lexing (reset buffers, reset vars tracking positions) + * Reset the lexer to start a new lexing (reset buffers, reset vars tracking + * positions) */ auto reset() -> void; @@ -91,8 +92,8 @@ class Lexer { /** * Gets next token from the input string - * If next token is an uncaught string, the next variable token is already prepped to be - * returned on the next call + * If next token is an uncaught string, the next variable token is already + * prepped to be returned on the next call * @param input_buffer * @param Token& * @return ErrorCode::Success @@ -109,7 +110,7 @@ class Lexer { * @return Token * @throw runtime_error("Input buffer about to overflow") */ - auto scan_with_wildcard(ParserInputBuffer& input_buffer, char wildcard, Token& token) + auto scan_with_wildcard(ParserInputBuffer& input_buffer, char wildcard, Token& token) -> ErrorCode; [[nodiscard]] auto get_has_delimiters() const -> bool const& { return m_has_delimiters; } @@ -167,9 +168,9 @@ class Lexer { namespace lexers { using ByteLexer = Lexer; using UTF8Lexer = Lexer; -} // namespace lexers -} // namespace log_surgeon +} // namespace lexers +} // namespace log_surgeon #include "Lexer.tpp" -#endif // LOG_SURGEON_LEXER_HPP +#endif // LOG_SURGEON_LEXER_HPP diff --git a/src/log_surgeon/Lexer.tpp b/src/log_surgeon/Lexer.tpp index 8f21a50c..898e58f9 100644 --- a/src/log_surgeon/Lexer.tpp +++ b/src/log_surgeon/Lexer.tpp @@ -1,14 +1,12 @@ #ifndef LOG_SURGEON_LEXER_TPP #define LOG_SURGEON_LEXER_TPP -// C++ standard libraries #include #include #include -// Project headers -#include "Constants.hpp" -#include "finite_automata/RegexAST.hpp" +#include +#include /** * utf8 format (https://en.wikipedia.org/wiki/UTF-8) @@ -25,8 +23,8 @@ void Lexer::flip_states(uint32_t old_storage_size) { } else { m_match_pos += old_storage_size / 2; } - /// TODO when m_start_pos == old_storage_size / 2, theres two possible cases currently so both - /// options are potentially wrong + // TODO when m_start_pos == old_storage_size / 2, theres two possible cases + // currently so both options are potentially wrong if (m_start_pos > old_storage_size / 2) { m_start_pos -= old_storage_size / 2; } else { @@ -51,12 +49,13 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_match = false; m_last_match_pos = m_match_pos; m_last_match_line = m_match_line; - token = Token{m_start_pos, - m_match_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_match_line, - m_type_ids}; + token = Token{ + m_start_pos, + m_match_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_match_line, + m_type_ids}; return ErrorCode::Success; } m_start_pos = input_buffer.storage().pos(); @@ -72,8 +71,9 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_prev_state = state; return err; } - if ((m_is_delimiter[next_char] || input_buffer.log_fully_consumed() || !m_has_delimiters) && - state->is_accepting()) { + if ((m_is_delimiter[next_char] || input_buffer.log_fully_consumed() || !m_has_delimiters) + && state->is_accepting()) + { m_match = true; m_type_ids = &(state->get_tags()); m_match_pos = prev_byte_buf_pos; @@ -97,23 +97,25 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To input_buffer.set_pos(m_match_pos); m_line = m_match_line; if (m_last_match_pos != m_start_pos) { - token = Token{m_last_match_pos, - m_start_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_last_match_line, - &cTokenUncaughtStringTypes}; + token = Token{ + m_last_match_pos, + m_start_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_last_match_line, + &cTokenUncaughtStringTypes}; return ErrorCode::Success; } m_match = false; m_last_match_pos = m_match_pos; m_last_match_line = m_match_line; - token = Token{m_start_pos, - m_match_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_match_line, - m_type_ids}; + token = Token{ + m_start_pos, + m_match_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_match_line, + m_type_ids}; return ErrorCode::Success; } if (input_buffer.log_fully_consumed() && m_start_pos == input_buffer.storage().pos()) { @@ -121,29 +123,33 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To m_match_pos = input_buffer.storage().pos(); m_type_ids = &cTokenEndTypes; m_match = true; - token = Token{m_last_match_pos, - m_start_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_last_match_line, - &cTokenUncaughtStringTypes}; + token = Token{ + m_last_match_pos, + m_start_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_last_match_line, + &cTokenUncaughtStringTypes}; return ErrorCode::Success; } - token = Token{input_buffer.storage().pos(), - input_buffer.storage().pos(), - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_line, - &cTokenEndTypes}; + token = Token{ + input_buffer.storage().pos(), + input_buffer.storage().pos(), + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_line, + &cTokenEndTypes}; return ErrorCode::Success; } - /// TODO: remove timestamp from m_is_fist_char so that m_is_delimiter check not - /// needed - while (input_buffer.log_fully_consumed() == false && - (m_is_first_char[next_char] == false || m_is_delimiter[next_char] == false)) { + // TODO: remove timestamp from m_is_fist_char so that m_is_delimiter + // check not needed + while (input_buffer.log_fully_consumed() == false + && (m_is_first_char[next_char] == false || m_is_delimiter[next_char] == false)) + { prev_byte_buf_pos = input_buffer.storage().pos(); if (ErrorCode err = input_buffer.get_next_character(next_char); - ErrorCode::Success != err) { + ErrorCode::Success != err) + { m_asked_for_more_data = true; m_prev_state = state; return err; @@ -158,11 +164,13 @@ auto Lexer::scan(ParserInputBuffer& input_buffer, To } } -/// TODO: this is duplicating almost all the code of scan() +// TODO: this is duplicating almost all the code of scan() template -auto Lexer::scan_with_wildcard(ParserInputBuffer& input_buffer, - char wildcard, - Token& token) -> ErrorCode { +auto Lexer::scan_with_wildcard( + ParserInputBuffer& input_buffer, + char wildcard, + Token& token +) -> ErrorCode { DFAStateType* state = m_dfa->get_root(); if (m_asked_for_more_data) { state = m_prev_state; @@ -172,12 +180,13 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in m_match = false; m_last_match_pos = m_match_pos; m_last_match_line = m_match_line; - token = Token{m_start_pos, - m_match_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_match_line, - m_type_ids}; + token = Token{ + m_start_pos, + m_match_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_match_line, + m_type_ids}; return ErrorCode::Success; } m_start_pos = input_buffer.storage().pos(); @@ -193,8 +202,9 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in m_prev_state = state; return err; } - if ((m_is_delimiter[next_char] || input_buffer.log_fully_consumed() || !m_has_delimiters) && - state->is_accepting()) { + if ((m_is_delimiter[next_char] || input_buffer.log_fully_consumed() || !m_has_delimiters) + && state->is_accepting()) + { m_match = true; m_type_ids = &(state->get_tags()); m_match_pos = prev_byte_buf_pos; @@ -215,12 +225,13 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in if (input_buffer.log_fully_consumed() || next == nullptr) { assert(input_buffer.log_fully_consumed()); if (!m_match || (m_match && m_match_pos != input_buffer.storage().pos())) { - token = Token{m_last_match_pos, - input_buffer.storage().pos(), - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_last_match_line, - &cTokenUncaughtStringTypes}; + token = Token{ + m_last_match_pos, + input_buffer.storage().pos(), + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_last_match_line, + &cTokenUncaughtStringTypes}; return ErrorCode::Success; } if (m_match) { @@ -229,12 +240,13 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { DFAStateType* next_state = state->next(byte); if (next_state->is_accepting() == false) { - token = Token{m_last_match_pos, - input_buffer.storage().pos(), - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_last_match_line, - &cTokenUncaughtStringTypes}; + token = Token{ + m_last_match_pos, + input_buffer.storage().pos(), + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_last_match_line, + &cTokenUncaughtStringTypes}; return ErrorCode::Success; } } @@ -245,12 +257,13 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in while (!unvisited_states.empty()) { DFAStateType* current_state = unvisited_states.top(); if (current_state == nullptr || current_state->is_accepting() == false) { - token = Token{m_last_match_pos, - input_buffer.storage().pos(), - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_last_match_line, - &cTokenUncaughtStringTypes}; + token = Token{ + m_last_match_pos, + input_buffer.storage().pos(), + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_last_match_line, + &cTokenUncaughtStringTypes}; return ErrorCode::Success; } unvisited_states.pop(); @@ -271,12 +284,13 @@ auto Lexer::scan_with_wildcard(ParserInputBuffer& in m_match = false; m_last_match_pos = m_match_pos; m_last_match_line = m_match_line; - token = Token{m_start_pos, - m_match_pos, - input_buffer.storage().get_active_buffer(), - input_buffer.storage().size(), - m_match_line, - m_type_ids}; + token = Token{ + m_start_pos, + m_match_pos, + input_buffer.storage().get_active_buffer(), + input_buffer.storage().size(), + m_match_line, + m_type_ids}; return ErrorCode::Success; } } @@ -312,7 +326,9 @@ void Lexer::add_delimiters(std::vector con template void Lexer::add_rule( - uint32_t const& id, std::unique_ptr> rule) { + uint32_t const& id, + std::unique_ptr> rule +) { m_rules.emplace_back(id, std::move(rule)); } @@ -363,8 +379,8 @@ void Lexer::generate_reverse() { } template -void Lexer::Rule::add_ast( - finite_automata::RegexNFA* nfa) const { +void Lexer::Rule::add_ast(finite_automata::RegexNFA* nfa +) const { NFAStateType* s = nfa->new_state(); s->set_accepting(true); s->set_tag(m_name); @@ -393,12 +409,12 @@ template auto Lexer::nfa_to_dfa(finite_automata::RegexNFA& nfa) -> std::unique_ptr> { typedef std::set StateSet; - std::unique_ptr> dfa = - std::make_unique>(); + std::unique_ptr> dfa + = std::make_unique>(); std::map dfa_states; std::stack unmarked_sets; - auto create_dfa_state = - [&dfa, &dfa_states, &unmarked_sets](StateSet const& set) -> DFAStateType* { + auto create_dfa_state = [&dfa, &dfa_states, &unmarked_sets](StateSet const& set + ) -> DFAStateType* { DFAStateType* state = dfa->new_state(set); dfa_states[set] = state; unmarked_sets.push(set); @@ -419,14 +435,15 @@ auto Lexer::nfa_to_dfa(finite_automata::RegexNFAget_tree_transitions().all()) - // { - // for (NFAStateType* const s1: data.m_value) { - // StateSet closure = epsilon_closure(s1); - // transitions_map[data.m_interval].insert(closure.begin(), closure.end()); - // } - // } + // TODO: add this for the utf8 case + /* + for (const typename NFAStateType::Tree::Data& data : s0->get_tree_transitions().all()) { + for (NFAStateType* const s1 : data.m_value) { + StateSet closure = epsilon_closure(s1); + transitions_map[data.m_interval].insert(closure.begin(), closure.end()); + } + } + */ } auto next_dfa_state = [&dfa_states, &create_dfa_state](StateSet const& set) -> DFAStateType* { @@ -443,15 +460,18 @@ auto Lexer::nfa_to_dfa(finite_automata::RegexNFAadd_byte_transition(kv.first, dest_state); } - /// TODO: add this for the utf8 case - // for (const typename map::value_type& kv: - // transitions_map) { - // DFAStateType* dest_state = next_dfa_state(kv.second); - // dfa_state->add_tree_transition(kv.first, dest_state); - // } + // TODO: add this for the utf8 case + /* + for (const typename map::value_type& kv : + transitions_map) + { + DFAStateType* dest_state = next_dfa_state(kv.second); + dfa_state->add_tree_transition(kv.first, dest_state); + } + */ } return dfa; } -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LEXER_TPP +#endif // LOG_SURGEON_LEXER_TPP diff --git a/src/log_surgeon/LogEvent.cpp b/src/log_surgeon/LogEvent.cpp index edd12488..c25e5da1 100644 --- a/src/log_surgeon/LogEvent.cpp +++ b/src/log_surgeon/LogEvent.cpp @@ -1,21 +1,24 @@ +#include "LogEvent.hpp" + #include #include #include -#include "Constants.hpp" -#include "LogEvent.hpp" -#include "LogParser.hpp" -#include "LogParserOutputBuffer.hpp" -#include "Token.hpp" +#include +#include +#include +#include namespace log_surgeon { LogEventView::LogEventView(LogParser const* log_parser) - : m_log_parser{log_parser}, - m_log_var_occurrences{log_parser->m_lexer.m_id_symbol.size()} { + : m_log_parser{log_parser}, + m_log_var_occurrences{log_parser->m_lexer.m_id_symbol.size()} { m_log_output_buffer = std::make_unique(); } -auto LogEventView::deep_copy() const -> LogEvent { return {*this}; } +auto LogEventView::deep_copy() const -> LogEvent { + return {*this}; +} auto LogEventView::reset() -> void { for (std::vector& log_var_occ : m_log_var_occurrences) { @@ -79,12 +82,13 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} m_buffer[curr_pos] = c; curr_pos++; } - Token copied_token{start_pos, - curr_pos, - m_buffer.data(), - buffer_size, - 0, - token.m_type_ids_ptr}; + Token copied_token{ + start_pos, + curr_pos, + m_buffer.data(), + buffer_size, + 0, + token.m_type_ids_ptr}; m_log_output_buffer->set_curr_token(copied_token); m_log_output_buffer->advance_to_next_token(); } @@ -94,4 +98,4 @@ LogEvent::LogEvent(LogEventView const& src) : LogEventView{src.get_log_parser()} add_token(token_types[0], &token); } } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/LogEvent.hpp b/src/log_surgeon/LogEvent.hpp index 29f90a23..6767de49 100644 --- a/src/log_surgeon/LogEvent.hpp +++ b/src/log_surgeon/LogEvent.hpp @@ -5,9 +5,9 @@ #include #include -#include "LogParser.hpp" -#include "LogParserOutputBuffer.hpp" -#include "Token.hpp" +#include +#include +#include namespace log_surgeon { class LogEvent; @@ -143,6 +143,6 @@ class LogEvent : public LogEventView { private: std::vector m_buffer; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LOG_EVENT_HPP +#endif // LOG_SURGEON_LOG_EVENT_HPP diff --git a/src/log_surgeon/LogParser.cpp b/src/log_surgeon/LogParser.cpp index 246de912..bd7b6f80 100644 --- a/src/log_surgeon/LogParser.cpp +++ b/src/log_surgeon/LogParser.cpp @@ -1,15 +1,13 @@ #include "LogParser.hpp" -// C++ standard libraries #include #include #include #include -// Project headers -#include "Constants.hpp" -#include "FileReader.hpp" -#include "SchemaParser.hpp" +#include +#include +#include using std::make_unique; using std::runtime_error; @@ -29,8 +27,7 @@ using finite_automata::RegexASTOr; using finite_automata::RegexDFAByteState; using finite_automata::RegexNFAByteState; -LogParser::LogParser(string const& schema_file_path) - : m_has_start_of_log(false) { +LogParser::LogParser(string const& schema_file_path) : m_has_start_of_log(false) { std::unique_ptr schema_ast = SchemaParser::try_schema_file(schema_file_path); add_delimiters(schema_ast->m_delimiters); add_rules(schema_ast.get()); @@ -51,27 +48,33 @@ auto LogParser::add_delimiters(unique_ptr const& delimiters) -> void } void LogParser::add_rules(SchemaAST const* schema_ast) { - // Currently, required to have delimiters (if schema_ast->delimiters != nullptr it is already - // enforced that at least 1 delimiter is specified) + // Currently, required to have delimiters (if schema_ast->delimiters != + // nullptr it is already enforced that at least 1 delimiter is specified) if (schema_ast->m_delimiters == nullptr) { throw runtime_error("When using --schema-path, \"delimiters:\" line must be used."); } - vector& delimiters = - dynamic_cast(schema_ast->m_delimiters.get())->m_delimiters; + vector& delimiters + = dynamic_cast(schema_ast->m_delimiters.get())->m_delimiters; add_token("newLine", '\n'); for (unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); if (rule->m_name == "timestamp") { unique_ptr> first_timestamp_regex_ast( - rule->m_regex_ptr->clone()); + rule->m_regex_ptr->clone() + ); add_rule("firstTimestamp", std::move(first_timestamp_regex_ast)); unique_ptr> newline_timestamp_regex_ast( - rule->m_regex_ptr->clone()); - unique_ptr> r2 = - make_unique>('\n'); - add_rule("newLineTimestamp", - make_unique>( - std::move(r2), std::move(newline_timestamp_regex_ast))); + rule->m_regex_ptr->clone() + ); + unique_ptr> r2 + = make_unique>('\n'); + add_rule( + "newLineTimestamp", + make_unique>( + std::move(r2), + std::move(newline_timestamp_regex_ast) + ) + ); // prevent timestamps from going into the dictionary continue; } @@ -94,11 +97,12 @@ void LogParser::add_rules(SchemaAST const* schema_ast) { FileReader schema_reader; ErrorCode error_code = schema_reader.try_open(schema_ast->m_file_path); if (ErrorCode::Success != error_code) { - throw std::runtime_error(schema_ast->m_file_path + ":" + - to_string(rule->m_line_num + 1) + ": error: '" + - rule->m_name + - "' has regex pattern which contains delimiter '" + - char(delimiter_name) + "'.\n"); + throw std::runtime_error( + schema_ast->m_file_path + ":" + to_string(rule->m_line_num + 1) + + ": error: '" + rule->m_name + + "' has regex pattern which contains delimiter '" + char(delimiter_name) + + "'.\n" + ); } // more detailed debugging based on looking at the file string line; @@ -116,17 +120,21 @@ void LogParser::add_rules(SchemaAST const* schema_ast) { string spaces(colon_pos, ' '); string arrows(line.size() - colon_pos, '^'); - throw std::runtime_error(schema_ast->m_file_path + ":" + - to_string(rule->m_line_num + 1) + ": error: '" + rule->m_name + - "' has regex pattern which contains delimiter '" + - char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + - spaces + arrows + "\n"); + throw std::runtime_error( + schema_ast->m_file_path + ":" + to_string(rule->m_line_num + 1) + ": error: '" + + rule->m_name + "' has regex pattern which contains delimiter '" + + char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + spaces + + arrows + "\n" + ); } - unique_ptr> delimiter_group = - make_unique>( - RegexASTGroup(delimiters)); + unique_ptr> delimiter_group + = make_unique>( + RegexASTGroup(delimiters) + ); rule->m_regex_ptr = make_unique>( - std::move(delimiter_group), std::move(rule->m_regex_ptr)); + std::move(delimiter_group), + std::move(rule->m_regex_ptr) + ); add_rule(rule->m_name, std::move(rule->m_regex_ptr)); } } @@ -136,11 +144,13 @@ auto LogParser::reset() -> void { m_lexer.reset(); } -/// TODO: if the first text is a variable in the no timestamp case you lose the first character -/// to static text since it has no leading delim -/// TODO: switching between timestamped and non-timestamped logs -auto LogParser::parse(std::unique_ptr& output_buffer, - LogParser::ParsingAction& parsing_action) -> ErrorCode { +// TODO: if the first text is a variable in the no timestamp case you lose the +// first character to static text since it has no leading delim +// TODO: switching between timestamped and non-timestamped logs +auto LogParser::parse( + std::unique_ptr& output_buffer, + LogParser::ParsingAction& parsing_action +) -> ErrorCode { if (0 == output_buffer->pos()) { output_buffer->set_has_delimiters(m_lexer.get_has_delimiters()); Token next_token; @@ -157,8 +167,9 @@ auto LogParser::parse(std::unique_ptr& output_buffer, parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } - if (next_token.m_type_ids_ptr->at(0) == (int)SymbolID::TokenFirstTimestampId || - next_token.m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineTimestampId) { + if (next_token.m_type_ids_ptr->at(0) == (int)SymbolID::TokenFirstTimestampId + || next_token.m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineTimestampId) + { output_buffer->set_has_timestamp(true); output_buffer->set_token(0, next_token); output_buffer->set_pos(1); @@ -177,24 +188,25 @@ auto LogParser::parse(std::unique_ptr& output_buffer, } output_buffer->set_curr_token(next_token); int token_type = next_token.m_type_ids_ptr->at(0); - bool found_start_of_next_message = - (output_buffer->has_timestamp() && - token_type == (int)SymbolID::TokenNewlineTimestampId) || - (!output_buffer->has_timestamp() && next_token.get_char(0) == '\n' && - token_type != (int)SymbolID::TokenNewlineId); + bool found_start_of_next_message = (output_buffer->has_timestamp() + && token_type == (int)SymbolID::TokenNewlineTimestampId) + || (!output_buffer->has_timestamp() + && next_token.get_char(0) == '\n' + && token_type != (int)SymbolID::TokenNewlineId); if (token_type == (int)SymbolID::TokenEndID) { parsing_action = ParsingAction::CompressAndFinish; return ErrorCode::Success; } - if (false == output_buffer->has_timestamp() && - token_type == (int)SymbolID::TokenNewlineId) { + if (false == output_buffer->has_timestamp() && token_type == (int)SymbolID::TokenNewlineId) + { m_input_buffer.set_consumed_pos(output_buffer->get_curr_token().m_end_pos); output_buffer->advance_to_next_token(); parsing_action = ParsingAction::Compress; return ErrorCode::Success; } if (found_start_of_next_message) { - // increment by 1 because the '\n' character is not part of the next log message + // increment by 1 because the '\n' character is not part of the next + // log message m_start_of_log_message = output_buffer->get_curr_token(); if (m_start_of_log_message.m_start_pos == m_start_of_log_message.m_buffer_size - 1) { m_start_of_log_message.m_start_pos = 0; @@ -204,8 +216,8 @@ auto LogParser::parse(std::unique_ptr& output_buffer, // make the last token of the current message the '\n' character Token curr_token = output_buffer->get_curr_token(); curr_token.m_end_pos = curr_token.m_start_pos + 1; - curr_token.m_type_ids_ptr = - &Lexer::cTokenUncaughtStringTypes; + curr_token.m_type_ids_ptr + = &Lexer::cTokenUncaughtStringTypes; output_buffer->set_curr_token(curr_token); if (0 == m_start_of_log_message.m_start_pos) { m_input_buffer.set_consumed_pos(m_input_buffer.storage().size() - 1); @@ -231,4 +243,4 @@ auto LogParser::get_symbol_id(std::string const& symbol) const -> std::optional< auto LogParser::get_next_symbol(Token& token) -> ErrorCode { return m_lexer.scan(m_input_buffer, token); } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/LogParser.hpp b/src/log_surgeon/LogParser.hpp index c36a38cd..9e3dfcb5 100644 --- a/src/log_surgeon/LogParser.hpp +++ b/src/log_surgeon/LogParser.hpp @@ -1,25 +1,27 @@ #ifndef LOG_SURGEON_LOG_PARSER_HPP #define LOG_SURGEON_LOG_PARSER_HPP -// C++ standard libraries #include #include #include -// Project headers -#include "Constants.hpp" -#include "LALR1Parser.hpp" -#include "LogParserOutputBuffer.hpp" -#include "Parser.hpp" -#include "ParserInputBuffer.hpp" -#include "SchemaParser.hpp" +#include +#include +#include +#include +#include +#include namespace log_surgeon { /// TODO: Compare c-array vs. vectors (its underlying array) for buffers class LogParser - : public Parser { + : public Parser { public: - enum class ParsingAction { None, Compress, CompressAndFinish }; + enum class ParsingAction { + None, + Compress, + CompressAndFinish + }; /** * Constructs the parser using the given schema file. @@ -49,7 +51,8 @@ class LogParser * its tokens are stored into output_buffer. * @param output_buffer Buffer to write Token objects to as they are parsed. * @param parsing_action Returns the action for CLP to take by reference. - * @return ErrorCode::Success if successfully parsed to the start of a new log event. + * @return ErrorCode::Success if successfully parsed to the start of a new + * log event. * @return ErrorCode from LogParser::get_next_symbol. */ auto parse(std::unique_ptr& output_buffer, ParsingAction& parsing_action) @@ -157,6 +160,6 @@ class LogParser bool m_has_start_of_log; Token m_start_of_log_message{}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LOG_PARSER_HPP +#endif // LOG_SURGEON_LOG_PARSER_HPP diff --git a/src/log_surgeon/LogParserOutputBuffer.cpp b/src/log_surgeon/LogParserOutputBuffer.cpp index 48ec9081..56657f39 100644 --- a/src/log_surgeon/LogParserOutputBuffer.cpp +++ b/src/log_surgeon/LogParserOutputBuffer.cpp @@ -1,6 +1,5 @@ #include "LogParserOutputBuffer.hpp" -// C++ standard libraries #include using std::string; @@ -15,4 +14,4 @@ auto LogParserOutputBuffer::advance_to_next_token() -> void { m_storage.copy(old_storage, old_storage + old_size, 0); } } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/LogParserOutputBuffer.hpp b/src/log_surgeon/LogParserOutputBuffer.hpp index 4c247479..e96cd357 100644 --- a/src/log_surgeon/LogParserOutputBuffer.hpp +++ b/src/log_surgeon/LogParserOutputBuffer.hpp @@ -1,9 +1,8 @@ #ifndef LOG_SURGEON_LOG_PARSER_OUTPUT_BUFFER_HPP #define LOG_SURGEON_LOG_PARSER_OUTPUT_BUFFER_HPP -// Project Headers -#include "Buffer.hpp" -#include "Token.hpp" +#include +#include namespace log_surgeon { /** @@ -64,6 +63,6 @@ class LogParserOutputBuffer { // contains the static and dynamic Token buffers Buffer m_storage{}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LOG_PARSER_OUTPUT_BUFFER_HPP +#endif // LOG_SURGEON_LOG_PARSER_OUTPUT_BUFFER_HPP diff --git a/src/log_surgeon/Parser.hpp b/src/log_surgeon/Parser.hpp index 33418de6..54efd528 100644 --- a/src/log_surgeon/Parser.hpp +++ b/src/log_surgeon/Parser.hpp @@ -1,8 +1,7 @@ #ifndef LOG_SURGEON_PARSER_HPP #define LOG_SURGEON_PARSER_HPP -// Project headers -#include "Lexer.hpp" +#include namespace log_surgeon { @@ -11,15 +10,16 @@ class Parser { public: Parser(); - virtual auto add_rule(std::string const& name, - std::unique_ptr> rule) -> void; + virtual auto + add_rule(std::string const& name, std::unique_ptr> rule) + -> void; auto add_token(std::string const& name, char rule_char) -> void; Lexer m_lexer; }; -} //namespace log_surgeon +} // namespace log_surgeon #include "Parser.tpp" -#endif // LOG_SURGEON_PARSER_HPP +#endif // LOG_SURGEON_PARSER_HPP diff --git a/src/log_surgeon/Parser.tpp b/src/log_surgeon/Parser.tpp index 7efe00dd..69c8a5ea 100644 --- a/src/log_surgeon/Parser.tpp +++ b/src/log_surgeon/Parser.tpp @@ -1,11 +1,9 @@ #ifndef LOG_SURGEON_PARSER_TPP #define LOG_SURGEON_PARSER_TPP -// C++ standard libraries #include -// Project headers -#include "finite_automata/RegexAST.hpp" +#include namespace log_surgeon { @@ -33,7 +31,9 @@ Parser::Parser() { template void Parser::add_rule( - std::string const& name, std::unique_ptr> rule) { + std::string const& name, + std::unique_ptr> rule +) { if (m_lexer.m_symbol_id.find(name) == m_lexer.m_symbol_id.end()) { m_lexer.m_symbol_id[name] = m_lexer.m_symbol_id.size(); m_lexer.m_id_symbol[m_lexer.m_symbol_id[name]] = name; @@ -45,6 +45,6 @@ template void Parser::add_token(std::string const& name, char rule_char) { add_rule(name, std::make_unique>(rule_char)); } -} //namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_PARSER_TPP +#endif // LOG_SURGEON_PARSER_TPP diff --git a/src/log_surgeon/ParserInputBuffer.cpp b/src/log_surgeon/ParserInputBuffer.cpp index d51da93d..ce51e772 100644 --- a/src/log_surgeon/ParserInputBuffer.cpp +++ b/src/log_surgeon/ParserInputBuffer.cpp @@ -1,10 +1,9 @@ -// C++ libraries -#include +#include "ParserInputBuffer.hpp" + +#include #include -// Project Headers -#include "Constants.hpp" -#include "ParserInputBuffer.hpp" +#include using std::string; using std::to_string; @@ -25,8 +24,9 @@ auto ParserInputBuffer::read_is_safe() -> bool { } // Check if the last log message ends in the buffer half last read. // This means the other half of the buffer has already been fully used. - if ((!m_last_read_first_half && m_consumed_pos > m_storage.size() / 2) || - (m_last_read_first_half && m_consumed_pos < m_storage.size() / 2 && m_consumed_pos > 0)) { + if ((!m_last_read_first_half && m_consumed_pos > m_storage.size() / 2) + || (m_last_read_first_half && m_consumed_pos < m_storage.size() / 2 && m_consumed_pos > 0)) + { return true; } return false; @@ -40,7 +40,8 @@ auto ParserInputBuffer::read(Reader& reader) -> ErrorCode { read_offset = m_storage.size() / 2; } if (ErrorCode err = m_storage.read(reader, read_offset, m_storage.size() / 2, bytes_read); - ErrorCode::Success != err) { + ErrorCode::Success != err) + { if (ErrorCode::EndOfFile == err) { m_finished_reading_input = true; } @@ -48,7 +49,8 @@ auto ParserInputBuffer::read(Reader& reader) -> ErrorCode { } m_last_read_first_half = !m_last_read_first_half; // TODO: This is not a portable check for certain forms of IO - // A method from Reader should be used to check if the input source is finished + // A method from Reader should be used to check if the input source is + // finished if (bytes_read < m_storage.size() / 2) { m_finished_reading_input = true; } @@ -88,8 +90,9 @@ auto ParserInputBuffer::get_next_character(unsigned char& next_char) -> ErrorCod next_char = utf8::cCharEOF; return ErrorCode::Success; } - if ((m_last_read_first_half && m_storage.pos() == m_storage.size() / 2) || - (!m_last_read_first_half && m_storage.pos() == 0)) { + if ((m_last_read_first_half && m_storage.pos() == m_storage.size() / 2) + || (!m_last_read_first_half && m_storage.pos() == 0)) + { return ErrorCode::BufferOutOfBounds; } char character = m_storage.get_curr_value(); @@ -105,14 +108,16 @@ auto ParserInputBuffer::get_next_character(unsigned char& next_char) -> ErrorCod // the user to wrap their input buffer. It tricks the LogParser and // ParserInputBuffer into thinking it never reaches the wrap, while still // respecting the actual size of the buffer the user passed in. -void ParserInputBuffer::set_storage(char* storage, - uint32_t size, - uint32_t pos, - bool finished_reading_input) { +void ParserInputBuffer::set_storage( + char* storage, + uint32_t size, + uint32_t pos, + bool finished_reading_input +) { reset(); m_storage.set_active_buffer(storage, size * 2, pos); m_finished_reading_input = finished_reading_input; m_pos_last_read_char = size; m_last_read_first_half = true; } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/ParserInputBuffer.hpp b/src/log_surgeon/ParserInputBuffer.hpp index ed1018a3..4a6cdaa7 100644 --- a/src/log_surgeon/ParserInputBuffer.hpp +++ b/src/log_surgeon/ParserInputBuffer.hpp @@ -122,7 +122,7 @@ class ParserInputBuffer { * @param reader to use for IO. */ auto read(Reader& reader) -> ErrorCode; - + private: // the position of the last character read into the buffer uint32_t m_pos_last_read_char{0}; @@ -136,6 +136,6 @@ class ParserInputBuffer { // the position last used by the caller (no longer needed in storage) uint32_t m_consumed_pos{m_storage.size() - 1}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_PARSER_INPUT_BUFFER_HPP +#endif // LOG_SURGEON_PARSER_INPUT_BUFFER_HPP diff --git a/src/log_surgeon/Reader.hpp b/src/log_surgeon/Reader.hpp index 15d97703..66ff16e8 100644 --- a/src/log_surgeon/Reader.hpp +++ b/src/log_surgeon/Reader.hpp @@ -3,7 +3,7 @@ #include -#include "Constants.hpp" +#include namespace log_surgeon { /** @@ -26,6 +26,6 @@ class Reader { std::function read{}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_LIBRARY_READER_HPP +#endif // LOG_SURGEON_LIBRARY_READER_HPP diff --git a/src/log_surgeon/ReaderParser.cpp b/src/log_surgeon/ReaderParser.cpp index 77871bf1..139e9b96 100644 --- a/src/log_surgeon/ReaderParser.cpp +++ b/src/log_surgeon/ReaderParser.cpp @@ -1,12 +1,14 @@ +#include "ReaderParser.hpp" + #include -#include "Constants.hpp" -#include "LogEvent.hpp" -#include "ReaderParser.hpp" -#include "Schema.hpp" +#include +#include +#include namespace log_surgeon { ReaderParser::ReaderParser(Schema& schema) : m_log_parser(schema.get_schema_ast_ptr()) {} + ReaderParser::ReaderParser(std::string const& schema_file_path) : m_log_parser(schema_file_path) {} auto ReaderParser::reset_and_set_reader(Reader& reader) -> void { @@ -18,7 +20,8 @@ auto ReaderParser::reset_and_set_reader(Reader& reader) -> void { auto ReaderParser::get_next_event_view(LogEventView& event_view) -> ErrorCode { event_view.reset(); if (ErrorCode err = m_log_parser.read_into_input(m_reader); - ErrorCode::Success != err && ErrorCode::EndOfFile != err) { + ErrorCode::Success != err && ErrorCode::EndOfFile != err) + { return err; } while (true) { @@ -33,16 +36,18 @@ auto ReaderParser::get_next_event_view(LogEventView& event_view) -> ErrorCode { if (ErrorCode::BufferOutOfBounds == parse_error) { uint32_t old_storage_size{0}; bool flipped_static_buffer{false}; - if (ErrorCode err = - m_log_parser.increase_capacity(old_storage_size, flipped_static_buffer); - ErrorCode::Success != err) { + if (ErrorCode err + = m_log_parser.increase_capacity(old_storage_size, flipped_static_buffer); + ErrorCode::Success != err) + { return err; } if (flipped_static_buffer) { m_log_parser.flip_lexer_states(old_storage_size); } if (ErrorCode err = m_log_parser.read_into_input(m_reader); - ErrorCode::Success != err && ErrorCode::EndOfFile != err) { + ErrorCode::Success != err && ErrorCode::EndOfFile != err) + { return err; } } else { @@ -57,17 +62,18 @@ auto ReaderParser::get_next_event_view(LogEventView& event_view) -> ErrorCode { for (uint32_t i = start; i < event_view.m_log_output_buffer->pos(); i++) { Token* token = &event_view.m_log_output_buffer->get_mutable_token(i); event_view.add_token(token->m_type_ids_ptr->at(0), token); - if (token->m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineId && - first_newline_pos == 0) { + if (token->m_type_ids_ptr->at(0) == (int)SymbolID::TokenNewlineId && first_newline_pos == 0) + { first_newline_pos = i; } } // To be a multiline log there must be at least one token between the // newline token and the last token in the output buffer. - if (event_view.m_log_output_buffer->has_timestamp() && 0 < first_newline_pos && - first_newline_pos + 1 < event_view.m_log_output_buffer->pos()) { + if (event_view.m_log_output_buffer->has_timestamp() && 0 < first_newline_pos + && first_newline_pos + 1 < event_view.m_log_output_buffer->pos()) + { event_view.set_multiline(true); } return ErrorCode::Success; } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/ReaderParser.hpp b/src/log_surgeon/ReaderParser.hpp index 466de855..938977a8 100644 --- a/src/log_surgeon/ReaderParser.hpp +++ b/src/log_surgeon/ReaderParser.hpp @@ -4,10 +4,10 @@ #include #include -#include "LogEvent.hpp" -#include "LogParser.hpp" -#include "Reader.hpp" -#include "Schema.hpp" +#include +#include +#include +#include namespace log_surgeon { /** @@ -84,6 +84,6 @@ class ReaderParser { LogParser m_log_parser; bool m_done{false}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_READER_PARSER_HPP +#endif // LOG_SURGEON_READER_PARSER_HPP diff --git a/src/log_surgeon/Schema.cpp b/src/log_surgeon/Schema.cpp index e6d8f3a0..479091c2 100644 --- a/src/log_surgeon/Schema.cpp +++ b/src/log_surgeon/Schema.cpp @@ -1,8 +1,8 @@ -#include - #include "Schema.hpp" +#include + namespace log_surgeon { Schema::Schema(std::string const& schema_file_path) - : m_schema_ast{SchemaParser::try_schema_file(schema_file_path)} {} -} // namespace log_surgeon + : m_schema_ast{SchemaParser::try_schema_file(schema_file_path)} {} +} // namespace log_surgeon diff --git a/src/log_surgeon/Schema.hpp b/src/log_surgeon/Schema.hpp index 3ed7cf9c..d2bf4777 100644 --- a/src/log_surgeon/Schema.hpp +++ b/src/log_surgeon/Schema.hpp @@ -4,7 +4,7 @@ #include #include -#include "SchemaParser.hpp" +#include namespace log_surgeon { /** @@ -47,13 +47,11 @@ class Schema { auto clear (); */ - [[nodiscard]] auto get_schema_ast_ptr() const -> SchemaAST const* { - return m_schema_ast.get(); - } + [[nodiscard]] auto get_schema_ast_ptr() const -> SchemaAST const* { return m_schema_ast.get(); } private: std::unique_ptr m_schema_ast; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_SCHEMA_HPP +#endif // LOG_SURGEON_SCHEMA_HPP diff --git a/src/log_surgeon/SchemaParser.cpp b/src/log_surgeon/SchemaParser.cpp index 28f08c2a..3dfc0f7e 100644 --- a/src/log_surgeon/SchemaParser.cpp +++ b/src/log_surgeon/SchemaParser.cpp @@ -1,20 +1,18 @@ #include "SchemaParser.hpp" -// C++ libraries #include #include #include -// Project headers -#include "Constants.hpp" -#include "FileReader.hpp" -#include "LALR1Parser.hpp" -#include "Lexer.hpp" -#include "finite_automata/RegexAST.hpp" -#include "utils.hpp" +#include +#include +#include +#include +#include +#include -using RegexASTByte = - log_surgeon::finite_automata::RegexAST; +using RegexASTByte + = log_surgeon::finite_automata::RegexAST; using RegexASTGroupByte = log_surgeon::finite_automata::RegexASTGroup< log_surgeon::finite_automata::RegexNFAByteState>; using RegexASTIntegerByte = log_surgeon::finite_automata::RegexASTInteger< @@ -23,10 +21,10 @@ using RegexASTLiteralByte = log_surgeon::finite_automata::RegexASTLiteral< log_surgeon::finite_automata::RegexNFAByteState>; using RegexASTMultiplicationByte = log_surgeon::finite_automata::RegexASTMultiplication< log_surgeon::finite_automata::RegexNFAByteState>; -using RegexASTOrByte = - log_surgeon::finite_automata::RegexASTOr; -using RegexASTCatByte = - log_surgeon::finite_automata::RegexASTCat; +using RegexASTOrByte + = log_surgeon::finite_automata::RegexASTOr; +using RegexASTCatByte = log_surgeon::finite_automata::RegexASTCat< + log_surgeon::finite_automata::RegexNFAByteState>; using std::make_unique; using std::string; @@ -42,7 +40,8 @@ SchemaParser::SchemaParser() { auto SchemaParser::generate_schema_ast(Reader& reader) -> unique_ptr { NonTerminal nonterminal = parse(reader); std::unique_ptr schema_ast( - dynamic_cast(nonterminal.get_parser_ast().release())); + dynamic_cast(nonterminal.get_parser_ast().release()) + ); return schema_ast; } @@ -52,11 +51,13 @@ auto SchemaParser::try_schema_file(string const& schema_file_path) -> unique_ptr if (ErrorCode::Success != error_code) { if (ErrorCode::Errno == error_code) { throw std::runtime_error( - strfmt("Failed to read '%s', errno=%d", schema_file_path.c_str(), errno)); + strfmt("Failed to read '%s', errno=%d", schema_file_path.c_str(), errno) + ); } int code{static_cast>(error_code)}; throw std::runtime_error( - strfmt("Failed to read '%s', error_code=%d", schema_file_path.c_str(), code)); + strfmt("Failed to read '%s', error_code=%d", schema_file_path.c_str(), code) + ); } SchemaParser sp; Reader reader{[&](char* buf, size_t count, size_t& read_to) -> ErrorCode { @@ -139,33 +140,38 @@ using ParserValueRegex = ParserValue>; static auto regex_identity_rule(NonTerminal* m) -> unique_ptr { return unique_ptr(new ParserValueRegex( - std::move(m->non_terminal_cast(0)->get_parser_ast()->get>()))); + std::move(m->non_terminal_cast(0)->get_parser_ast()->get>()) + )); } static auto regex_cat_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); auto& r2 = m->non_terminal_cast(1)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTCatByte(std::move(r1), std::move(r2))))); + unique_ptr(new RegexASTCatByte(std::move(r1), std::move(r2))) + )); } static auto regex_or_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); auto& r2 = m->non_terminal_cast(2)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTOrByte(std::move(r1), std::move(r2))))); + unique_ptr(new RegexASTOrByte(std::move(r1), std::move(r2))) + )); } static auto regex_match_zero_or_more_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 0, 0)))); + unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 0, 0)) + )); } static auto regex_match_one_or_more_rule(NonTerminal* m) -> unique_ptr { auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 1, 0)))); + unique_ptr(new RegexASTMultiplicationByte(std::move(r1), 1, 0)) + )); } static auto regex_match_exactly_rule(NonTerminal* m) -> unique_ptr { @@ -178,7 +184,8 @@ static auto regex_match_exactly_rule(NonTerminal* m) -> unique_ptr { } auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTMultiplicationByte(std::move(r1), reps, reps)))); + unique_ptr(new RegexASTMultiplicationByte(std::move(r1), reps, reps)) + )); } static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr { @@ -198,7 +205,8 @@ static auto regex_match_range_rule(NonTerminal* m) -> unique_ptr { } auto& r1 = m->non_terminal_cast(0)->get_parser_ast()->get>(); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTMultiplicationByte(std::move(r1), min, max)))); + unique_ptr(new RegexASTMultiplicationByte(std::move(r1), min, max)) + )); } static auto regex_add_literal_existing_group_rule(NonTerminal* m) -> unique_ptr { @@ -207,7 +215,8 @@ static auto regex_add_literal_existing_group_rule(NonTerminal* m) -> unique_ptr< auto* r1_ptr = dynamic_cast(r1.get()); auto* r2_ptr = dynamic_cast(r2.get()); return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr))) + ); } static auto regex_add_range_existing_group_rule(NonTerminal* m) -> unique_ptr { @@ -216,21 +225,24 @@ static auto regex_add_range_existing_group_rule(NonTerminal* m) -> unique_ptr(r1.get()); auto* r2_ptr = dynamic_cast(r2.get()); return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr))) + ); } static auto regex_add_literal_new_group_rule(NonTerminal* m) -> unique_ptr { auto& r2 = m->non_terminal_cast(1)->get_parser_ast()->get>(); auto* r2_ptr = dynamic_cast(r2.get()); return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr)))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr))) + ); } static auto regex_add_range_new_group_rule(NonTerminal* m) -> unique_ptr { auto& r2 = m->non_terminal_cast(1)->get_parser_ast()->get>(); auto* r2_ptr = dynamic_cast(r2.get()); return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr)))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r2_ptr))) + ); } static auto regex_complement_incomplete_group_rule(NonTerminal* /* m */) -> unique_ptr { @@ -243,26 +255,30 @@ static auto regex_range_rule(NonTerminal* m) -> unique_ptr { auto* r1_ptr = dynamic_cast(r1.get()); auto* r2_ptr = dynamic_cast(r2.get()); return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr)))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte(r1_ptr, r2_ptr))) + ); } static auto regex_middle_identity_rule(NonTerminal* m) -> unique_ptr { return unique_ptr(new ParserValueRegex( - std::move(m->non_terminal_cast(1)->get_parser_ast()->get>()))); + std::move(m->non_terminal_cast(1)->get_parser_ast()->get>()) + )); } static auto regex_literal_rule(NonTerminal* m) -> unique_ptr { Token* token = m->token_cast(0); assert(token->to_string().size() == 1); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTLiteralByte(token->to_string()[0])))); + unique_ptr(new RegexASTLiteralByte(token->to_string()[0])) + )); } static auto regex_cancel_literal_rule(NonTerminal* m) -> unique_ptr { Token* token = m->token_cast(1); assert(token->to_string().size() == 1); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTLiteralByte(token->to_string()[0])))); + unique_ptr(new RegexASTLiteralByte(token->to_string()[0])) + )); } static auto regex_existing_integer_rule(NonTerminal* m) -> unique_ptr { @@ -271,19 +287,22 @@ static auto regex_existing_integer_rule(NonTerminal* m) -> unique_ptr Token* token = m->token_cast(1); assert(token->to_string().size() == 1); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTIntegerByte(r2_ptr, token->to_string()[0])))); + unique_ptr(new RegexASTIntegerByte(r2_ptr, token->to_string()[0])) + )); } static auto regex_new_integer_rule(NonTerminal* m) -> unique_ptr { Token* token = m->token_cast(0); assert(token->to_string().size() == 1); return unique_ptr(new ParserValueRegex( - unique_ptr(new RegexASTIntegerByte(token->to_string()[0])))); + unique_ptr(new RegexASTIntegerByte(token->to_string()[0])) + )); } static auto regex_digit_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTGroupByte('0', '9')))); + new ParserValueRegex(unique_ptr(new RegexASTGroupByte('0', '9'))) + ); } static auto regex_wildcard_rule(NonTerminal* /* m */) -> unique_ptr { @@ -294,34 +313,41 @@ static auto regex_wildcard_rule(NonTerminal* /* m */) -> unique_ptr { static auto regex_vertical_tab_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\v')))); + new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\v'))) + ); } static auto regex_form_feed_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\f')))); + new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\f'))) + ); } static auto regex_tab_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\t')))); + new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\t'))) + ); } static auto regex_char_return_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\r')))); + new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\r'))) + ); } static auto regex_newline_rule(NonTerminal* /* m */) -> unique_ptr { return unique_ptr( - new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\n')))); + new ParserValueRegex(unique_ptr(new RegexASTLiteralByte('\n'))) + ); } static auto regex_white_space_rule(NonTerminal* /* m */) -> unique_ptr { - unique_ptr regex_ast_group = - make_unique(RegexASTGroupByte({' ', '\t', '\r', '\n', '\v', '\f'})); + unique_ptr regex_ast_group = make_unique( + RegexASTGroupByte({' ', '\t', '\r', '\n', '\v', '\f'}) + ); return unique_ptr( - new ParserValueRegex(unique_ptr(std::move(regex_ast_group)))); + new ParserValueRegex(unique_ptr(std::move(regex_ast_group))) + ); } static auto existing_delimiter_string_rule(NonTerminal* m) -> unique_ptr { @@ -340,11 +366,11 @@ static auto new_delimiter_string_rule(NonTerminal* m) -> unique_ptr { } void SchemaParser::add_lexical_rules() { - add_token("Tab", '\t'); // 9 - add_token("NewLine", '\n'); // 10 - add_token("VerticalTab", '\v'); // 11 - add_token("FormFeed", '\f'); // 12 - add_token("CarriageReturn", '\r'); // 13 + add_token("Tab", '\t'); // 9 + add_token("NewLine", '\n'); // 10 + add_token("VerticalTab", '\v'); // 11 + add_token("FormFeed", '\f'); // 12 + add_token("CarriageReturn", '\r'); // 13 add_token("Space", ' '); add_token("Bang", '!'); add_token("Quotation", '"'); @@ -402,18 +428,31 @@ void SchemaParser::add_productions() { add_production("Schema", {"Comment"}, new_schema_rule); add_production("Schema", {"SchemaVar"}, new_schema_rule_with_var); add_production( - "Schema", {"Delimiters", "Colon", "DelimiterString"}, new_schema_rule_with_delimiters); + "Schema", + {"Delimiters", "Colon", "DelimiterString"}, + new_schema_rule_with_delimiters + ); add_production("Schema", {"Schema", "PortableNewLine"}, identity_rule_ParserASTSchema); add_production( - "Schema", {"Schema", "PortableNewLine", "Comment"}, identity_rule_ParserASTSchema); - add_production("Schema", - {"Schema", "PortableNewLine", "SchemaVar"}, - std::bind(&SchemaParser::existing_schema_rule, this, std::placeholders::_1)); - add_production("Schema", - {"Schema", "PortableNewLine", "Delimiters", "Colon", "DelimiterString"}, - existing_schema_rule_with_delimiter); + "Schema", + {"Schema", "PortableNewLine", "Comment"}, + identity_rule_ParserASTSchema + ); + add_production( + "Schema", + {"Schema", "PortableNewLine", "SchemaVar"}, + std::bind(&SchemaParser::existing_schema_rule, this, std::placeholders::_1) + ); add_production( - "DelimiterString", {"DelimiterString", "Literal"}, existing_delimiter_string_rule); + "Schema", + {"Schema", "PortableNewLine", "Delimiters", "Colon", "DelimiterString"}, + existing_schema_rule_with_delimiter + ); + add_production( + "DelimiterString", + {"DelimiterString", "Literal"}, + existing_delimiter_string_rule + ); add_production("DelimiterString", {"Literal"}, new_delimiter_string_rule); add_production("PortableNewLine", {"CarriageReturn", "NewLine"}, nullptr); add_production("PortableNewLine", {"NewLine"}, nullptr); @@ -423,7 +462,10 @@ void SchemaParser::add_productions() { add_production("Text", {"Text", "Delimiters"}, nullptr); add_production("Text", {"Delimiters"}, nullptr); add_production( - "SchemaVar", {"WhitespaceStar", "Identifier", "Colon", "Regex"}, schema_var_rule); + "SchemaVar", + {"WhitespaceStar", "Identifier", "Colon", "Regex"}, + schema_var_rule + ); add_production("Identifier", {"Identifier", "AlphaNumeric"}, existing_identifier_rule); add_production("Identifier", {"AlphaNumeric"}, new_identifier_rule); add_production("WhitespaceStar", {"WhitespaceStar", "Space"}, nullptr); @@ -439,28 +481,41 @@ void SchemaParser::add_productions() { add_production("Or", {"CompleteGroup"}, regex_identity_rule); add_production("MatchStar", {"CompleteGroup", "Star"}, regex_match_zero_or_more_rule); add_production("MatchPlus", {"CompleteGroup", "Plus"}, regex_match_one_or_more_rule); - add_production("MatchExact", - {"CompleteGroup", "Lbrace", "Integer", "Rbrace"}, - regex_match_exactly_rule); - add_production("MatchRange", - {"CompleteGroup", "Lbrace", "Integer", "Comma", "Integer", "Rbrace"}, - regex_match_range_rule); + add_production( + "MatchExact", + {"CompleteGroup", "Lbrace", "Integer", "Rbrace"}, + regex_match_exactly_rule + ); + add_production( + "MatchRange", + {"CompleteGroup", "Lbrace", "Integer", "Comma", "Integer", "Rbrace"}, + regex_match_range_rule + ); add_production("CompleteGroup", {"IncompleteGroup", "Rbracket"}, regex_identity_rule); add_production("CompleteGroup", {"Literal"}, regex_identity_rule); add_production("CompleteGroup", {"Digit"}, regex_identity_rule); add_production("CompleteGroup", {"Wildcard"}, regex_identity_rule); add_production("CompleteGroup", {"WhiteSpace"}, regex_identity_rule); - add_production("IncompleteGroup", - {"IncompleteGroup", "LiteralRange"}, - regex_add_range_existing_group_rule); add_production( - "IncompleteGroup", {"IncompleteGroup", "Digit"}, regex_add_range_existing_group_rule); - add_production("IncompleteGroup", - {"IncompleteGroup", "Literal"}, - regex_add_literal_existing_group_rule); - add_production("IncompleteGroup", - {"IncompleteGroup", "WhiteSpace"}, - regex_add_literal_existing_group_rule); + "IncompleteGroup", + {"IncompleteGroup", "LiteralRange"}, + regex_add_range_existing_group_rule + ); + add_production( + "IncompleteGroup", + {"IncompleteGroup", "Digit"}, + regex_add_range_existing_group_rule + ); + add_production( + "IncompleteGroup", + {"IncompleteGroup", "Literal"}, + regex_add_literal_existing_group_rule + ); + add_production( + "IncompleteGroup", + {"IncompleteGroup", "WhiteSpace"}, + regex_add_literal_existing_group_rule + ); add_production("IncompleteGroup", {"Lbracket", "LiteralRange"}, regex_add_range_new_group_rule); add_production("IncompleteGroup", {"Lbracket", "Digit"}, regex_add_range_new_group_rule); add_production("IncompleteGroup", {"Lbracket", "Literal"}, regex_add_literal_new_group_rule); @@ -513,4 +568,4 @@ void SchemaParser::add_productions() { add_production("Wildcard", {"Dot"}, regex_wildcard_rule); add_production("WhiteSpace", {"Backslash", "s"}, regex_white_space_rule); } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/SchemaParser.hpp b/src/log_surgeon/SchemaParser.hpp index 5cfd997e..110b1504 100644 --- a/src/log_surgeon/SchemaParser.hpp +++ b/src/log_surgeon/SchemaParser.hpp @@ -3,8 +3,7 @@ #include -// Project headers -#include "LALR1Parser.hpp" +#include namespace log_surgeon { // ASTs used in SchemaParser AST @@ -40,13 +39,15 @@ class IdentifierAST : public ParserAST { class SchemaVarAST : public ParserAST { public: // Constructor - SchemaVarAST(std::string name, - std::unique_ptr> - regex_ptr, - uint32_t line_num) - : m_line_num(line_num), - m_name(std::move(name)), - m_regex_ptr(std::move(regex_ptr)) {} + SchemaVarAST( + std::string name, + std::unique_ptr> + regex_ptr, + uint32_t line_num + ) + : m_line_num(line_num), + m_name(std::move(name)), + m_regex_ptr(std::move(regex_ptr)) {} uint32_t m_line_num; std::string m_name; @@ -63,8 +64,9 @@ class DelimiterStringAST : public ParserAST { std::vector m_delimiters; }; -class SchemaParser - : public LALR1Parser { +class SchemaParser : public LALR1Parser< + finite_automata::RegexNFAByteState, + finite_automata::RegexDFAByteState> { public: // Constructor SchemaParser(); @@ -77,7 +79,8 @@ class SchemaParser auto existing_schema_rule(NonTerminal* m) -> std::unique_ptr; /** - * Parse a user defined schema to generate a schema AST used for generating the log lexer + * Parse a user defined schema to generate a schema AST used for generating + * the log lexer * @param reader * @return std::unique_ptr */ @@ -88,13 +91,12 @@ class SchemaParser * @param schema_file_path * @return std::unique_ptr */ - static auto try_schema_file(std::string const& schema_file_path) - -> std::unique_ptr; + static auto try_schema_file(std::string const& schema_file_path) -> std::unique_ptr; private: /** - * After lexing half of the buffer, reads into that half of the buffer and changes variables - * accordingly + * After lexing half of the buffer, reads into that half of the buffer and + * changes variables accordingly * @param next_children_start */ auto soft_reset(uint32_t& next_children_start) -> void; @@ -109,6 +111,6 @@ class SchemaParser */ auto add_productions() -> void; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_SCHEMA_PARSER_HPP +#endif // LOG_SURGEON_SCHEMA_PARSER_HPP diff --git a/src/log_surgeon/Token.cpp b/src/log_surgeon/Token.cpp index 2165b7cb..59b24d17 100644 --- a/src/log_surgeon/Token.cpp +++ b/src/log_surgeon/Token.cpp @@ -10,8 +10,8 @@ auto Token::to_string() -> std::string { return {m_buffer + m_start_pos, m_buffer + m_end_pos}; } if (m_wrap_around_string.empty()) { - m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size} + - std::string{m_buffer, m_buffer + m_end_pos}; + m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size} + + std::string{m_buffer, m_buffer + m_end_pos}; } return {m_wrap_around_string}; } @@ -21,8 +21,8 @@ auto Token::to_string_view() -> std::string_view { return {m_buffer + m_start_pos, m_end_pos - m_start_pos}; } if (m_wrap_around_string.empty()) { - m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size} + - std::string{m_buffer, m_buffer + m_end_pos}; + m_wrap_around_string = std::string{m_buffer + m_start_pos, m_buffer + m_buffer_size} + + std::string{m_buffer, m_buffer + m_end_pos}; } return {m_wrap_around_string}; } @@ -44,4 +44,4 @@ auto Token::get_length() const -> uint32_t { } return m_buffer_size - m_start_pos + m_end_pos; } -} // namespace log_surgeon +} // namespace log_surgeon diff --git a/src/log_surgeon/Token.hpp b/src/log_surgeon/Token.hpp index 4c871807..7911ae56 100644 --- a/src/log_surgeon/Token.hpp +++ b/src/log_surgeon/Token.hpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_TOKEN_HPP #define LOG_SURGEON_TOKEN_HPP -// C++ standard libraries #include #include #include @@ -44,6 +43,6 @@ class Token { std::vector const* m_type_ids_ptr{nullptr}; std::string m_wrap_around_string{}; }; -} // namespace log_surgeon +} // namespace log_surgeon -#endif // LOG_SURGEON_TOKEN_HPP +#endif // LOG_SURGEON_TOKEN_HPP diff --git a/src/log_surgeon/finite_automata/RegexAST.hpp b/src/log_surgeon/finite_automata/RegexAST.hpp index 9b57f273..d7949145 100644 --- a/src/log_surgeon/finite_automata/RegexAST.hpp +++ b/src/log_surgeon/finite_automata/RegexAST.hpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_HPP -// C++ standard libraries #include #include #include @@ -9,17 +8,15 @@ #include #include -// Project headers -#include "../Constants.hpp" -#include "RegexNFA.hpp" -#include "UnicodeIntervalTree.hpp" +#include +#include +#include namespace log_surgeon::finite_automata { template class RegexAST { public: - // Destructor virtual ~RegexAST() = default; /** @@ -29,7 +26,8 @@ class RegexAST { [[nodiscard]] virtual auto clone() const -> RegexAST* = 0; /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule * @param is_possible_input */ virtual auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void = 0; @@ -41,19 +39,17 @@ class RegexAST { virtual auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void = 0; /** - * Add the needed RegexNFA::states to the passed in nfa to handle the current node before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle the + * current node before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ virtual auto add(RegexNFA* nfa, NFAStateType* end_state) -> void = 0; }; -// Leaf node template class RegexASTLiteral : public RegexAST { public: - // Constructor explicit RegexASTLiteral(uint32_t character); /** @@ -65,8 +61,8 @@ class RegexASTLiteral : public RegexAST { } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTLiteral at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTLiteral at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -74,8 +70,8 @@ class RegexASTLiteral : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule, which does nothing as - * RegexASTLiteral is a leaf node that is not a RegexASTGroup + * Transforms '.' to to be any non-delimiter in a lexer rule, which does + * nothing as RegexASTLiteral is a leaf node that is not a RegexASTGroup * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& /* delimiters */) -> void override { @@ -83,8 +79,8 @@ class RegexASTLiteral : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTLiteral before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTLiteral before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -96,14 +92,11 @@ class RegexASTLiteral : public RegexAST { uint32_t m_character; }; -// Leaf node template class RegexASTInteger : public RegexAST { public: - // Constructor explicit RegexASTInteger(uint32_t digit); - // Constructor RegexASTInteger(RegexASTInteger* left, uint32_t digit); /** @@ -115,8 +108,8 @@ class RegexASTInteger : public RegexAST { } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTInteger at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTInteger at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -126,8 +119,8 @@ class RegexASTInteger : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule, which does nothing as - * RegexASTInteger is a leaf node that is not a RegexASTGroup + * Transforms '.' to to be any non-delimiter in a lexer rule, which does + * nothing as RegexASTInteger is a leaf node that is not a RegexASTGroup * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& /* delimiters */) -> void override { @@ -135,8 +128,8 @@ class RegexASTInteger : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTInteger before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTInteger before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -150,34 +143,25 @@ class RegexASTInteger : public RegexAST { std::vector m_digits; }; -// Lead node template class RegexASTGroup : public RegexAST { public: using Range = std::pair; - // constructor RegexASTGroup(); - // constructor RegexASTGroup(RegexASTGroup* left, RegexASTLiteral* right); - // constructor RegexASTGroup(RegexASTGroup* left, RegexASTGroup* right); - // constructor explicit RegexASTGroup(RegexASTLiteral* right); - // constructor explicit RegexASTGroup(RegexASTGroup* right); - // constructor RegexASTGroup(RegexASTLiteral* left, RegexASTLiteral* right); - // constructor RegexASTGroup(uint32_t min, uint32_t max); - // constructor explicit RegexASTGroup(std::vector const& literals); /** @@ -187,8 +171,8 @@ class RegexASTGroup : public RegexAST { [[nodiscard]] auto clone() const -> RegexASTGroup* override { return new RegexASTGroup(*this); } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTGroup at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTGroup at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -214,8 +198,8 @@ class RegexASTGroup : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule if this RegexASTGroup node contains - * `.` (is a wildcard group) + * Transforms '.' to to be any non-delimiter in a lexer rule if this + * RegexASTGroup node contains `.` (is a wildcard group) * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void override { @@ -244,8 +228,8 @@ class RegexASTGroup : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTGroup before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTGroup before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -259,15 +243,15 @@ class RegexASTGroup : public RegexAST { private: /** - * Merges multiple ranges such that the resulting m_ranges is sorted and non-overlapping - * @param ranges + * Merges multiple ranges such that the resulting m_ranges is sorted and + * non-overlapping @param ranges * @return std::vector */ static auto merge(std::vector const& ranges) -> std::vector; /** - * Takes the compliment (in the cast of regex `^` at the start of a group) of multiple ranges - * such that m_ranges is sorted and non-overlapping + * Takes the compliment (in the cast of regex `^` at the start of a group) + * of multiple ranges such that m_ranges is sorted and non-overlapping * @param ranges * @return std::vector */ @@ -278,19 +262,17 @@ class RegexASTGroup : public RegexAST { std::vector m_ranges; }; -// Intermediate node - template class RegexASTOr : public RegexAST { public: - // Constructor - RegexASTOr(std::unique_ptr> /*left*/, - std::unique_ptr> /*right*/); + RegexASTOr( + std::unique_ptr> /*left*/, + std::unique_ptr> /*right*/ + ); - // Constructor RegexASTOr(RegexASTOr const& rhs) - : m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTOr @@ -299,8 +281,8 @@ class RegexASTOr : public RegexAST { [[nodiscard]] auto clone() const -> RegexASTOr* override { return new RegexASTOr(*this); } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTOr at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTOr at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -309,8 +291,8 @@ class RegexASTOr : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a - * descendant of this RegexASTOr node + * Transforms '.' to to be any non-delimiter in a lexer rule if + * RegexASTGroup with `.` is a descendant of this RegexASTOr node * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void override { @@ -319,8 +301,8 @@ class RegexASTOr : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTOr before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTOr before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -331,18 +313,17 @@ class RegexASTOr : public RegexAST { std::unique_ptr> m_right; }; -// Intermediate node template class RegexASTCat : public RegexAST { public: - // Constructor - RegexASTCat(std::unique_ptr> /*left*/, - std::unique_ptr> /*right*/); + RegexASTCat( + std::unique_ptr> /*left*/, + std::unique_ptr> /*right*/ + ); - // Constructor RegexASTCat(RegexASTCat const& rhs) - : m_left(std::unique_ptr>(rhs.m_left->clone())), - m_right(std::unique_ptr>(rhs.m_right->clone())) {} + : m_left(std::unique_ptr>(rhs.m_left->clone())), + m_right(std::unique_ptr>(rhs.m_right->clone())) {} /** * Used for cloning a unique_pointer of type RegexASTCat @@ -351,8 +332,8 @@ class RegexASTCat : public RegexAST { [[nodiscard]] auto clone() const -> RegexASTCat* override { return new RegexASTCat(*this); } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTCat at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTCat at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -361,8 +342,8 @@ class RegexASTCat : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a - * descendant of this RegexASTCat node + * Transforms '.' to to be any non-delimiter in a lexer rule if + * RegexASTGroup with `.` is a descendant of this RegexASTCat node * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void override { @@ -371,8 +352,8 @@ class RegexASTCat : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTCat before - * transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTCat before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -383,20 +364,19 @@ class RegexASTCat : public RegexAST { std::unique_ptr> m_right; }; -// Intermediate node template class RegexASTMultiplication : public RegexAST { public: - // Constructor - RegexASTMultiplication(std::unique_ptr> operand, - uint32_t min, - uint32_t max); + RegexASTMultiplication( + std::unique_ptr> operand, + uint32_t min, + uint32_t max + ); - // Constructor RegexASTMultiplication(RegexASTMultiplication const& rhs) - : m_operand(std::unique_ptr>(rhs.m_operand->clone())), - m_min(rhs.m_min), - m_max(rhs.m_max) {} + : m_operand(std::unique_ptr>(rhs.m_operand->clone())), + m_min(rhs.m_min), + m_max(rhs.m_max) {} /** * Used for cloning a unique_pointer of type RegexASTMultiplication @@ -407,8 +387,8 @@ class RegexASTMultiplication : public RegexAST { } /** - * Sets is_possible_input to specify which utf8 characters are allowed in a lexer rule - * containing RegexASTMultiplication at a leaf node in its AST + * Sets is_possible_input to specify which utf8 characters are allowed in a + * lexer rule containing RegexASTMultiplication at a leaf node in its AST * @param is_possible_input */ auto set_possible_inputs_to_true(bool is_possible_input[]) const -> void override { @@ -416,8 +396,9 @@ class RegexASTMultiplication : public RegexAST { } /** - * Transforms '.' to to be any non-delimiter in a lexer rule if RegexASTGroup with `.` is a - * descendant of this RegexASTMultiplication node + * Transforms '.' to to be any non-delimiter in a lexer rule if + * RegexASTGroup with `.` is a descendant of this RegexASTMultiplication + * node * @param delimiters */ auto remove_delimiters_from_wildcard(std::vector& delimiters) -> void override { @@ -425,8 +406,8 @@ class RegexASTMultiplication : public RegexAST { } /** - * Add the needed RegexNFA::states to the passed in nfa to handle a RegexASTMultiplication - * before transitioning to a pre-tagged end_state + * Add the needed RegexNFA::states to the passed in nfa to handle a + * RegexASTMultiplication before transitioning to a pre-tagged end_state * @param nfa * @param end_state */ @@ -439,8 +420,8 @@ class RegexASTMultiplication : public RegexAST { uint32_t m_min; uint32_t m_max; }; -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata #include "RegexAST.tpp" -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_HPP diff --git a/src/log_surgeon/finite_automata/RegexAST.tpp b/src/log_surgeon/finite_automata/RegexAST.tpp index b3d5cc4c..d0aa1f53 100644 --- a/src/log_surgeon/finite_automata/RegexAST.tpp +++ b/src/log_surgeon/finite_automata/RegexAST.tpp @@ -1,16 +1,14 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_TPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_TPP -// C++ standard libraries #include #include #include #include -// Project headers -#include "../Constants.hpp" -#include "RegexNFA.hpp" -#include "UnicodeIntervalTree.hpp" +#include +#include +#include namespace log_surgeon::finite_automata { @@ -30,22 +28,24 @@ RegexASTInteger::RegexASTInteger(uint32_t digit) { template RegexASTInteger::RegexASTInteger(RegexASTInteger* left, uint32_t digit) - : m_digits(std::move(left->m_digits)) { + : m_digits(std::move(left->m_digits)) { digit = digit - '0'; m_digits.push_back(digit); } template -void RegexASTInteger::add(RegexNFA* /* nfa */, - NFAStateType* /* end_state */) { +void RegexASTInteger< + NFAStateType>::add(RegexNFA* /* nfa */, NFAStateType* /* end_state */) { throw std::runtime_error("Unsupported"); } template -RegexASTOr::RegexASTOr(std::unique_ptr> left, - std::unique_ptr> right) - : m_left(std::move(left)), - m_right(std::move(right)) {} +RegexASTOr::RegexASTOr( + std::unique_ptr> left, + std::unique_ptr> right +) + : m_left(std::move(left)), + m_right(std::move(right)) {} template void RegexASTOr::add(RegexNFA* nfa, NFAStateType* end_state) { @@ -54,10 +54,12 @@ void RegexASTOr::add(RegexNFA* nfa, NFAStateType* en } template -RegexASTCat::RegexASTCat(std::unique_ptr> left, - std::unique_ptr> right) - : m_left(std::move(left)), - m_right(std::move(right)) {} +RegexASTCat::RegexASTCat( + std::unique_ptr> left, + std::unique_ptr> right +) + : m_left(std::move(left)), + m_right(std::move(right)) {} template void RegexASTCat::add(RegexNFA* nfa, NFAStateType* end_state) { @@ -71,14 +73,19 @@ void RegexASTCat::add(RegexNFA* nfa, NFAStateType* e template RegexASTMultiplication::RegexASTMultiplication( - std::unique_ptr> operand, uint32_t min, uint32_t max) - : m_operand(std::move(operand)), - m_min(min), - m_max(max) {} - -template -void RegexASTMultiplication::add(RegexNFA* nfa, - NFAStateType* end_state) { + std::unique_ptr> operand, + uint32_t min, + uint32_t max +) + : m_operand(std::move(operand)), + m_min(min), + m_max(max) {} + +template +void RegexASTMultiplication::add( + RegexNFA* nfa, + NFAStateType* end_state +) { NFAStateType* saved_root = nfa->get_root(); if (this->m_min == 0) { nfa->get_root()->add_epsilon_transition(end_state); @@ -114,8 +121,10 @@ template RegexASTGroup::RegexASTGroup() = default; template -RegexASTGroup::RegexASTGroup(RegexASTGroup* left, - RegexASTLiteral* right) { +RegexASTGroup::RegexASTGroup( + RegexASTGroup* left, + RegexASTLiteral* right +) { if (right == nullptr) { throw std::runtime_error("RegexASTGroup1: right == nullptr: A bracket expression in the " "schema contains illegal characters, remember to escape special " @@ -127,11 +136,13 @@ RegexASTGroup::RegexASTGroup(RegexASTGroup* left, } template -RegexASTGroup::RegexASTGroup(RegexASTGroup* left, - RegexASTGroup* right) - : m_negate(left->m_negate), - m_ranges(left->m_ranges) { - assert(right->m_ranges.size() == 1); // Only add LiteralRange +RegexASTGroup::RegexASTGroup( + RegexASTGroup* left, + RegexASTGroup* right +) + : m_negate(left->m_negate), + m_ranges(left->m_ranges) { + assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } @@ -148,18 +159,21 @@ RegexASTGroup::RegexASTGroup(RegexASTLiteral* right) template RegexASTGroup::RegexASTGroup(RegexASTGroup* right) : m_negate(false) { - assert(right->m_ranges.size() == 1); // Only add LiteralRange + assert(right->m_ranges.size() == 1); // Only add LiteralRange m_ranges.push_back(right->m_ranges[0]); } template -RegexASTGroup::RegexASTGroup(RegexASTLiteral* left, - RegexASTLiteral* right) { +RegexASTGroup::RegexASTGroup( + RegexASTLiteral* left, + RegexASTLiteral* right +) { if (left == nullptr || right == nullptr) { throw std::runtime_error( "RegexASTGroup3: left == nullptr || right == nullptr: A bracket expression in the " "schema contains illegal characters, remember to escape special characters. Refer " - "to README-Schema.md for more details."); + "to README-Schema.md for more details." + ); } m_negate = false; assert(right->get_character() > left->get_character()); @@ -168,7 +182,7 @@ RegexASTGroup::RegexASTGroup(RegexASTLiteral* left, template RegexASTGroup::RegexASTGroup(std::vector const& literals) - : m_negate(false) { + : m_negate(false) { for (uint32_t literal : literals) { m_ranges.emplace_back(literal, literal); } @@ -230,6 +244,6 @@ void RegexASTGroup::add(RegexNFA* nfa, NFAStateType* nfa->get_root()->add_interval(Interval(r.first, r.second), end_state); } } -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_TPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_AST_TPP diff --git a/src/log_surgeon/finite_automata/RegexDFA.hpp b/src/log_surgeon/finite_automata/RegexDFA.hpp index 17e8ba87..73e640fc 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.hpp +++ b/src/log_surgeon/finite_automata/RegexDFA.hpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP -// C++ standard libraries #include #include #include @@ -9,13 +8,15 @@ #include #include -// Project headers -#include "../Constants.hpp" -#include "RegexNFA.hpp" -#include "UnicodeIntervalTree.hpp" +#include +#include +#include namespace log_surgeon::finite_automata { -enum class RegexDFAStateType { Byte, UTF8 }; +enum class RegexDFAStateType { + Byte, + UTF8 +}; template class RegexDFAState { @@ -33,7 +34,8 @@ class RegexDFAState { } /** - * Returns the next state the DFA transitions to on input character (byte or utf8) + * Returns the next state the DFA transitions to on input character (byte or + * utf8) * @param character * @return RegexDFAState* */ @@ -42,8 +44,9 @@ class RegexDFAState { private: std::vector m_tags; RegexDFAState* m_bytes_transition[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. + // NOTE: We don't need m_tree_transitions for the `stateType == + // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // in that case. std::conditional_t> m_tree_transitions; }; @@ -54,7 +57,8 @@ template class RegexDFA { public: /** - * Creates a new DFA state based on a set of NFA states and adds it to m_states + * Creates a new DFA state based on a set of NFA states and adds it to + * m_states * @param set * @return DFAStateType* */ @@ -66,8 +70,8 @@ class RegexDFA { private: std::vector> m_states; }; -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata #include "RegexDFA.tpp" -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_HPP diff --git a/src/log_surgeon/finite_automata/RegexDFA.tpp b/src/log_surgeon/finite_automata/RegexDFA.tpp index cd96fcf1..6fe2fc88 100644 --- a/src/log_surgeon/finite_automata/RegexDFA.tpp +++ b/src/log_surgeon/finite_automata/RegexDFA.tpp @@ -11,8 +11,8 @@ auto RegexDFAState::next(uint32_t character) -> RegexDFAState> result = - m_tree_transitions.find(Interval(character, character)); + std::unique_ptr> result + = m_tree_transitions.find(Interval(character, character)); assert(result->size() <= 1); if (!result->empty()) { return result->front().m_value; @@ -34,6 +34,6 @@ auto RegexDFA::new_state(std::set const& set) -> DF } return state; } -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_TPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_DFA_TPP diff --git a/src/log_surgeon/finite_automata/RegexNFA.hpp b/src/log_surgeon/finite_automata/RegexNFA.hpp index f29aa199..7639dfb2 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.hpp +++ b/src/log_surgeon/finite_automata/RegexNFA.hpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP -// C++ standard libraries #include #include #include @@ -10,12 +9,14 @@ #include #include -// Project headers -#include "../Constants.hpp" -#include "UnicodeIntervalTree.hpp" +#include +#include namespace log_surgeon::finite_automata { -enum class RegexNFAStateType { Byte, UTF8 }; +enum class RegexNFAStateType { + Byte, + UTF8 +}; template class RegexNFAState { @@ -46,8 +47,9 @@ class RegexNFAState { return m_epsilon_transitions; } - auto set_byte_transitions(uint8_t byte, - std::vector*>& byte_transitions) -> void { + auto + set_byte_transitions(uint8_t byte, std::vector*>& byte_transitions) + -> void { m_bytes_transitions[byte] = byte_transitions; } @@ -79,8 +81,9 @@ class RegexNFAState { int m_tag; std::vector*> m_epsilon_transitions; std::vector*> m_bytes_transitions[cSizeOfByte]; - // NOTE: We don't need m_tree_transitions for the `stateType == RegexDFAStateType::Byte` case, - // so we use an empty class (`std::tuple<>`) in that case. + // NOTE: We don't need m_tree_transitions for the `stateType == + // RegexDFAStateType::Byte` case, so we use an empty class (`std::tuple<>`) + // in that case. std::conditional_t> m_tree_transitions; }; @@ -117,8 +120,8 @@ class RegexNFA { std::vector> m_states; NFAStateType* m_root; }; -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata #include "RegexNFA.tpp" -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_HPP diff --git a/src/log_surgeon/finite_automata/RegexNFA.tpp b/src/log_surgeon/finite_automata/RegexNFA.tpp index 83b4609a..62943dd5 100644 --- a/src/log_surgeon/finite_automata/RegexNFA.tpp +++ b/src/log_surgeon/finite_automata/RegexNFA.tpp @@ -1,20 +1,20 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_TPP #define LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_TPP -// C++ standard libraries #include #include #include #include -// Project headers -#include "../Constants.hpp" -#include "UnicodeIntervalTree.hpp" +#include +#include namespace log_surgeon::finite_automata { template -void RegexNFAState::add_interval(Interval interval, - RegexNFAState* dest_state) { +void RegexNFAState::add_interval( + Interval interval, + RegexNFAState* dest_state +) { if (interval.first < cSizeOfByte) { uint32_t bound = std::min(interval.second, cSizeOfByte - 1); for (uint32_t i = interval.first; i <= bound; i++) { @@ -26,8 +26,8 @@ void RegexNFAState::add_interval(Interval interval, if (interval.second < cSizeOfByte) { return; } - std::unique_ptr> overlaps = - m_tree_transitions.pop(interval); + std::unique_ptr> overlaps = m_tree_transitions.pop(interval + ); for (typename Tree::Data const& data : *overlaps) { uint32_t overlap_low = std::max(data.m_interval.first, interval.first); uint32_t overlap_high = std::min(data.m_interval.second, interval.second); @@ -36,15 +36,21 @@ void RegexNFAState::add_interval(Interval interval, tree_states.push_back(dest_state); m_tree_transitions.insert(Interval(overlap_low, overlap_high), tree_states); if (data.m_interval.first < interval.first) { - m_tree_transitions.insert(Interval(data.m_interval.first, interval.first - 1), - data.m_value); + m_tree_transitions.insert( + Interval(data.m_interval.first, interval.first - 1), + data.m_value + ); } else if (data.m_interval.first > interval.first) { - m_tree_transitions.insert(Interval(interval.first, data.m_interval.first - 1), - {dest_state}); + m_tree_transitions.insert( + Interval(interval.first, data.m_interval.first - 1), + {dest_state} + ); } if (data.m_interval.second > interval.second) { - m_tree_transitions.insert(Interval(interval.second + 1, data.m_interval.second), - data.m_value); + m_tree_transitions.insert( + Interval(interval.second + 1, data.m_interval.second), + data.m_value + ); } interval.first = data.m_interval.second + 1; } @@ -68,8 +74,8 @@ void RegexNFA::reverse() { std::map, std::vector> byte_edges; std::map, bool> epsilon_edges; for (std::unique_ptr& src_state_ptr : m_states) { - // TODO: handle utf8 case with if constexpr (RegexNFAUTF8State == NFAStateType) ~ don't - // really need this though + // TODO: handle utf8 case with if constexpr (RegexNFAUTF8State == + // NFAStateType) ~ don't really need this though for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { for (NFAStateType* dest_state_ptr : src_state_ptr->get_byte_transitions(byte)) { std::pair edge{src_state_ptr.get(), dest_state_ptr}; @@ -78,8 +84,9 @@ void RegexNFA::reverse() { src_state_ptr->clear_byte_transitions(byte); } for (NFAStateType* dest_state_ptr : src_state_ptr->get_epsilon_transitions()) { - epsilon_edges[std::pair(src_state_ptr.get(), - dest_state_ptr)] = true; + epsilon_edges + [std::pair(src_state_ptr.get(), dest_state_ptr)] + = true; } src_state_ptr->clear_epsilon_transitions(); } @@ -113,8 +120,8 @@ void RegexNFA::reverse() { unvisited_states.pop(); visited_states.insert(current_state); for (uint32_t byte = 0; byte < cSizeOfByte; byte++) { - std::vector byte_transitions = - current_state->get_byte_transitions(byte); + std::vector byte_transitions + = current_state->get_byte_transitions(byte); for (NFAStateType* next_state : byte_transitions) { if (visited_states.find(next_state) == visited_states.end()) { unvisited_states.push(next_state); @@ -176,6 +183,6 @@ auto RegexNFA::new_state() -> NFAStateType* { m_states.push_back(std::move(ptr)); return state; } -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_TPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_REGEX_NFA_TPP diff --git a/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp index f85e346b..e9050f95 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.hpp @@ -170,8 +170,8 @@ class UnicodeIntervalTree { std::unique_ptr m_root; }; -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata #include "UnicodeIntervalTree.tpp" -#endif // LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_HPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_HPP diff --git a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp index f0b7b524..56e9e0f1 100644 --- a/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp +++ b/src/log_surgeon/finite_automata/UnicodeIntervalTree.tpp @@ -1,7 +1,6 @@ #ifndef LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP #define LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP -// C++ standard libraries #include #include @@ -29,6 +28,7 @@ auto UnicodeIntervalTree::Node::insert(std::unique_ptr node, Interval i } node->update(); return Node::balance(std::move(node)); + } template @@ -91,10 +91,11 @@ auto UnicodeIntervalTree::pop(Interval interval) } template -auto UnicodeIntervalTree::Node::pop(std::unique_ptr node, - Interval interval, - std::unique_ptr* ret) - -> std::unique_ptr::Node> { +auto UnicodeIntervalTree::Node::pop( + std::unique_ptr node, + Interval interval, + std::unique_ptr* ret +) -> std::unique_ptr::Node> { if (node == nullptr) { return nullptr; } @@ -173,8 +174,8 @@ auto UnicodeIntervalTree::Node::balance(std::unique_ptr node) if (factor * factor <= 1) { return node; } - int sub_factor = - (factor < 0) ? node->m_left->balance_factor() : node->m_right->balance_factor(); + int sub_factor = (factor < 0) ? node->m_left->balance_factor() + : node->m_right->balance_factor(); if (factor * sub_factor > 0) { return Node::rotate(std::move(node), factor); } @@ -222,17 +223,17 @@ auto UnicodeIntervalTree::Node::rotate_ccw(std::unique_ptr node) template auto UnicodeIntervalTree::Node::overlaps_recursive(Interval i) -> bool { - return ((m_lower <= i.first) && (i.first <= m_upper)) || - ((m_lower <= i.second) && (i.second <= m_upper)) || - ((i.first <= m_lower) && (m_lower <= i.second)); + return ((m_lower <= i.first) && (i.first <= m_upper)) + || ((m_lower <= i.second) && (i.second <= m_upper)) + || ((i.first <= m_lower) && (m_lower <= i.second)); } template auto UnicodeIntervalTree::Node::overlaps(Interval i) -> bool { - return ((m_interval.first <= i.first) && (i.first <= m_interval.second)) || - ((m_interval.first <= i.second) && (i.second <= m_interval.second)) || - ((i.first <= m_interval.first) && (m_interval.first <= i.second)); + return ((m_interval.first <= i.first) && (i.first <= m_interval.second)) + || ((m_interval.first <= i.second) && (i.second <= m_interval.second)) + || ((i.first <= m_interval.first) && (m_interval.first <= i.second)); } -} // namespace log_surgeon::finite_automata +} // namespace log_surgeon::finite_automata -#endif // LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP +#endif // LOG_SURGEON_FINITE_AUTOMATA_UNICODE_INTERVAL_TREE_TPP diff --git a/src/log_surgeon/utils.hpp b/src/log_surgeon/utils.hpp index 4ea1ea6d..bb3140f0 100644 --- a/src/log_surgeon/utils.hpp +++ b/src/log_surgeon/utils.hpp @@ -21,4 +21,4 @@ auto strfmt(std::string const& fmt, Args... args) -> std::string { return {buf.data(), buf.data() + size}; } -#endif // LOG_SURGEON_UTILS_HPP +#endif // LOG_SURGEON_UTILS_HPP