Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#355] Adding MeTTa tokenizer #365

Merged
merged 7 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions extra/cpp/tokenizers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
cmake_minimum_required(VERSION 3.10)
project(Tokenizers)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Fetch GoogleTest at configure time (no system install required).
include(FetchContent)
FetchContent_Declare(
  googletest
  DOWNLOAD_EXTRACT_TIMESTAMP ON
  URL https://github.com/google/googletest/archive/refs/tags/v1.15.2.zip)
FetchContent_MakeAvailable(googletest)

enable_testing()

add_library(tokenizers STATIC
  src/metta_tokenizer.cc)

# Scope include paths to the library target instead of the whole directory
# (include_directories). PUBLIC propagates them to consumers such as the
# test executable below.
target_include_directories(tokenizers PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/include)

add_executable(
  test_metta_tokenizer
  tests/test_metta_tokenizer.cc)

# GTest::gtest_main carries its own include directories, thread dependency,
# and a main() — the legacy ${GTEST_LIBRARIES}/${GTEST_INCLUDE_DIRS}
# variables are only defined by find_package(GTest) and were empty here.
target_link_libraries(
  test_metta_tokenizer
  tokenizers
  GTest::gtest_main)

include(GoogleTest)
gtest_discover_tests(test_metta_tokenizer)
10 changes: 10 additions & 0 deletions extra/cpp/tokenizers/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# clean / build-tests / unit-tests produce no files named after themselves.
.PHONY: clean build-tests unit-tests

clean:
	@rm -rf ./build

build-tests: clean
	@mkdir -p ./build \
	&& cmake -S . -B ./build \
	&& cmake --build ./build --parallel $(shell nproc)

unit-tests: build-tests
	$(MAKE) -C ./build test
30 changes: 30 additions & 0 deletions extra/cpp/tokenizers/include/metta_tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once

#include <string>

// NOTE(review): the original header did `using namespace std;`, which injects
// the whole std namespace into every translation unit that includes this
// header. Narrowed to the single name existing includers actually rely on
// (the test file uses bare `string`); prefer spelling std::string at call
// sites and removing this alias eventually.
using std::string;

// -------------------------------------------------------------------------------------------------
/**
 * @brief Parses a MeTTa expression into a tokenized string stream.
 *
 * This function processes the input MeTTa expression string and converts it into a tokenized string
 * stream. The expression is expected to be in the format `(Similarity (Concept "human") $v1)`, where
 * elements inside the parentheses are links of type `Expression`. Each element inside the
 * parentheses, such as `Similarity`, `Concept`, and `"human"`, are nodes of type `Symbol`,
 * except for those that start with `$`, which are variables.
 *
 * Example:
 *
 * Input: `(Similarity (Concept "human") $v1)`
 *
 * Output: `LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol "human" VARIABLE v1`
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @return A tokenized string stream representing the parsed expression.
 * @throws std::runtime_error if the expression is invalid.
 */
std::string tokenize(const std::string& expression);

// -------------------------------------------------------------------------------------------------

15 changes: 15 additions & 0 deletions extra/cpp/tokenizers/main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include <iostream>

#include "metta_tokenizer.h"

using namespace std;

// -------------------------------------------------------------------------------------------------
int main() {
string expression = "(Similarity (Concept \"human\") $v1)";
auto tokenized = tokenize(expression);
cout << tokenize(expression) << endl;
return 0;
}

// -------------------------------------------------------------------------------------------------
79 changes: 79 additions & 0 deletions extra/cpp/tokenizers/src/metta_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include <cctype>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "metta_tokenizer.h"

using namespace std;

// -------------------------------------------------------------------------------------------------
/**
 * @brief Recursively parses a MeTTa (sub)expression into a tokenized string stream.
 *
 * Walks `expression` starting at `cursor`. The first '(' opens the current
 * expression; each nested '(' is handled by a recursive call. Bare tokens
 * become `NODE Symbol <token>`, tokens starting with '$' become
 * `VARIABLE <name>` (and switch this expression's header from LINK to
 * LINK_TEMPLATE), and double-quoted tokens are consumed verbatim, so a
 * symbol such as `"human being"` (with embedded spaces) stays one token.
 *
 * @param expression The input MeTTa expression string to be tokenized.
 * @param cursor The starting position in the expression string. Defaults to 0.
 * @return A pair of {index of this expression's closing ')', tokenized stream}.
 * @throws std::runtime_error if parentheses are unbalanced or a quote is unterminated.
 */
std::pair<size_t, std::string> _tokenize(const std::string& expression, size_t cursor = 0) {
    std::string output;
    std::string header = "LINK Expression";  // upgraded to LINK_TEMPLATE when a direct variable appears
    int target_count = 0;                    // number of direct targets of this expression
    std::string token;
    size_t start = cursor;

    for (; cursor < expression.size(); cursor++) {
        char ch = expression[cursor];

        if (ch == '(') {
            if (cursor > start) {
                // Nested expression: recurse. The callee returns the index of
                // its closing ')'; the loop increment then steps past it.
                std::tie(cursor, token) = _tokenize(expression, cursor);
                output += " " + token;
                target_count++;
            }
            // cursor == start: this '(' opens the current expression itself.
            continue;

        } else if (ch == ')') {
            // End of this expression: emit header, arity, then the targets.
            return std::make_pair(cursor, header + " " + std::to_string(target_count) + output);

        } else if (std::isspace(static_cast<unsigned char>(ch))) {
            continue;

        } else {
            token.clear();
            if (ch == '"') {
                // Quoted symbol: consume up to and including the closing quote
                // so embedded spaces/parentheses do not split the token.
                token += expression[cursor++];
                while (cursor < expression.size() && expression[cursor] != '"') {
                    token += expression[cursor++];
                }
                if (cursor == expression.size()) {
                    throw std::runtime_error("Invalid expression");  // unterminated quote
                }
                token += expression[cursor];  // closing '"'
            } else {
                while (cursor < expression.size()
                       && !std::isspace(static_cast<unsigned char>(expression[cursor]))
                       && expression[cursor] != '('
                       && expression[cursor] != ')') {
                    token += expression[cursor++];
                }
                // Step back so the loop increment lands on the delimiter.
                --cursor;
            }

            if (token[0] == '$') {
                // Variable: strip '$' and mark this expression as a template.
                header = "LINK_TEMPLATE Expression";
                output += " VARIABLE " + token.substr(1);
            } else {
                output += " NODE Symbol " + token;
            }
            target_count++;
        }
    }

    // Ran off the end without seeing this expression's closing ')'.
    throw std::runtime_error("Invalid expression");
}

// -------------------------------------------------------------------------------------------------
string tokenize(const string& expression) {
    // Public entry point: delegate to the recursive worker starting at the
    // beginning of the string and keep only the token stream — callers do
    // not care about the final cursor position.
    return _tokenize(expression).second;
}

// -------------------------------------------------------------------------------------------------

30 changes: 30 additions & 0 deletions extra/cpp/tokenizers/tests/test_metta_tokenizer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <gtest/gtest.h>

#include "metta_tokenizer.h"

// Each check feeds a MeTTa expression straight into tokenize() and compares
// against the expected token stream; no intermediate locals needed.
TEST(MettaTokenizerTest, BasicAssertions) {
    EXPECT_EQ(tokenize("(Similarity (Concept \"human\") $v1)"),
              "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1");

    EXPECT_EQ(tokenize("(Similarity $v1 (Concept \"human\"))"),
              "LINK_TEMPLATE Expression 3 NODE Symbol Similarity VARIABLE v1 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\"");

    EXPECT_EQ(tokenize("(Similarity $v0 (Concept \"human\") $v1)"),
              "LINK_TEMPLATE Expression 4 NODE Symbol Similarity VARIABLE v0 LINK Expression 2 NODE Symbol Concept NODE Symbol \"human\" VARIABLE v1");

    EXPECT_EQ(tokenize("(Similarity (Concept $v0) $v1)"),
              "LINK_TEMPLATE Expression 3 NODE Symbol Similarity LINK_TEMPLATE Expression 2 NODE Symbol Concept VARIABLE v0 VARIABLE v1");
}

int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}