diff --git a/extra/cpp/tokenizers/CMakeLists.txt b/extra/cpp/tokenizers/CMakeLists.txt
new file mode 100644
index 0000000..a790d93
--- /dev/null
+++ b/extra/cpp/tokenizers/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.10)
+project(Tokenizers)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Add GoogleTest
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  DOWNLOAD_EXTRACT_TIMESTAMP ON
+  URL https://github.com/google/googletest/archive/refs/tags/v1.15.2.zip)
+FetchContent_MakeAvailable(googletest)
+
+enable_testing()
+
+add_library(tokenizers STATIC
+  src/metta_tokenizer.cc)
+
+include_directories(
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${GTEST_INCLUDE_DIRS})
+
+add_executable(
+  test_metta_tokenizer
+  tests/test_metta_tokenizer.cc)
+
+target_link_libraries(
+  test_metta_tokenizer
+  tokenizers
+  gtest_main
+  ${GTEST_LIBRARIES}
+  pthread)
+
+include(GoogleTest)
+gtest_discover_tests(test_metta_tokenizer)
diff --git a/extra/cpp/tokenizers/Makefile b/extra/cpp/tokenizers/Makefile
new file mode 100644
index 0000000..1626405
--- /dev/null
+++ b/extra/cpp/tokenizers/Makefile
@@ -0,0 +1,10 @@
+clean:
+	@rm -rf ./build
+
+build-tests: clean
+	@mkdir -p ./build \
+	&& cmake -S . -B ./build \
+	&& cmake --build ./build --parallel $(nproc)
+
+unit-tests: build-tests
+	make -C ./build test
diff --git a/extra/cpp/tokenizers/include/metta_tokenizer.h b/extra/cpp/tokenizers/include/metta_tokenizer.h
new file mode 100644
index 0000000..0bf9390
--- /dev/null
+++ b/extra/cpp/tokenizers/include/metta_tokenizer.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <string>
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Parses a MeTTa expression into a tokenized string stream.
+ *
+ * This function processes the input MeTTa expression string and converts it into a tokenized string
+ * stream. The expression is expected to be in the format `(Similarity (Concept "human") $v1)`, where
+ * elements inside the parentheses are links of type `Expression`. Each element inside the
+ * parentheses, such as `Similarity`, `Concept`, and `"human"`, are nodes of type `Symbol`,
+ * except for those that start with `$`, which are variables.
+ *
+ * Example:
+ *
+ * Input: `(Similarity (Concept "human") $v1)`
+ *
+ * Output: `LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol "human" VARIABLE v1`
+ *
+ * @param expression The input MeTTa expression string to be tokenized.
+ * @return A tokenized string stream representing the parsed expression.
+ * @throws runtime_error if the expression is invalid.
+ */
+string tokenize(const string& expression);
+
+// -------------------------------------------------------------------------------------------------
+
diff --git a/extra/cpp/tokenizers/main.cc b/extra/cpp/tokenizers/main.cc
new file mode 100644
index 0000000..10330f4
--- /dev/null
+++ b/extra/cpp/tokenizers/main.cc
@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "metta_tokenizer.h"
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+int main() {
+    string expression = "(Similarity (Concept \"human\") $v1)";
+    auto tokenized = tokenize(expression);
+    cout << tokenized << endl;  // print the already-computed result; don't tokenize twice
+    return 0;
+}
+
+// -------------------------------------------------------------------------------------------------
diff --git a/extra/cpp/tokenizers/src/metta_tokenizer.cc b/extra/cpp/tokenizers/src/metta_tokenizer.cc
new file mode 100644
index 0000000..d67defd
--- /dev/null
+++ b/extra/cpp/tokenizers/src/metta_tokenizer.cc
@@ -0,0 +1,79 @@
+#include <cctype>
+#include <stdexcept>
+#include <tuple>
+
+#include "metta_tokenizer.h"
+
+using namespace std;
+
+// -------------------------------------------------------------------------------------------------
+/**
+ * @brief Parses a MeTTa expression into a tokenized string stream.
+ *
+ * This function processes the input MeTTa expression string starting from the given cursor position
+ * and returns a pair containing the updated cursor position and the tokenized string stream.
+ *
+ * @param expression The input MeTTa expression string to be tokenized.
+ * @param cursor The starting position in the expression string. Defaults to 0.
+ * @return A pair containing the updated cursor position and the tokenized string stream.
+ * @throws runtime_error if the expression is invalid.
+ */
+pair<size_t, string> _tokenize(const string& expression, size_t cursor = 0) {
+    string output;
+    string header = "LINK Expression";
+    int target_count = 0;
+    string token;
+    char ch;
+    size_t start = cursor;
+
+    for (; cursor < expression.size(); cursor++) {
+        ch = expression[cursor];
+
+        if (ch == '(') {
+            if (cursor > start) {  // nested expression: recurse; the opening '(' at start is skipped
+                tie(cursor, token) = _tokenize(expression, cursor);
+                output += " " + token;
+                target_count++;
+            }
+            continue;
+
+        } else if (ch == ')') {
+            return make_pair(cursor, header + " " + to_string(target_count) + output);
+
+        } else if (isspace(static_cast<unsigned char>(ch))) {  // cast: isspace is UB on negative char
+            continue;
+
+        } else {
+            token.clear();
+            while (
+                cursor < expression.size()
+                and not isspace(static_cast<unsigned char>(expression[cursor]))
+                and expression[cursor] != '('
+                and expression[cursor] != ')'
+            ) {
+                token += expression[cursor++];
+            }
+            --cursor;  // step back so the loop increment revisits the delimiter
+
+            if (token[0] == '$') {  // variable: strip '$' and promote this link to a template
+                header = "LINK_TEMPLATE Expression";
+                output += " VARIABLE " + token.substr(1);
+                target_count++;
+            } else {
+                output += " NODE Symbol " + token;
+                target_count++;
+            }
+        }
+    }
+
+    throw runtime_error("Invalid expression");
+}
+
+// -------------------------------------------------------------------------------------------------
+string tokenize(const string& expression) {
+    auto [_, tokenized_query] = _tokenize(expression);
+    return tokenized_query;
+}
+
+// -------------------------------------------------------------------------------------------------
+
diff --git a/extra/cpp/tokenizers/tests/test_metta_tokenizer.cc b/extra/cpp/tokenizers/tests/test_metta_tokenizer.cc
new file mode 100644
index 0000000..9eb8bc9
--- /dev/null
+++ b/extra/cpp/tokenizers/tests/test_metta_tokenizer.cc
@@ -0,0 +1,30 @@
+#include <gtest/gtest.h>
+
+#include "metta_tokenizer.h"
+
+TEST(MettaTokenizerTest, BasicAssertions) {
+  string expected = "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1";
+  string expression = "(Similarity (Concept \"human\") $v1)";
+  string actual = tokenize(expression);
+  EXPECT_EQ(actual, expected);
+
+  expected = "LINK_TEMPLATE Expression 3 NODE Symbol Similarity VARIABLE v1 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\"";
+  expression = "(Similarity $v1 (Concept \"human\"))";
+  actual = tokenize(expression);
+  EXPECT_EQ(actual, expected);
+
+  expected = "LINK_TEMPLATE Expression 4 NODE Symbol Similarity VARIABLE v0 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1";
+  expression = "(Similarity $v0 (Concept \"human\") $v1)";
+  actual = tokenize(expression);
+  EXPECT_EQ(actual, expected);
+
+  expected = "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK_TEMPLATE Expression 2 NODE Symbol Concept VARIABLE v0 VARIABLE v1";
+  expression = "(Similarity (Concept $v0) $v1)";
+  actual = tokenize(expression);
+  EXPECT_EQ(actual, expected);
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}