diff --git a/engine/custom_rank.py b/engine/custom_rank.py
index 128ec99..d00847d 100644
--- a/engine/custom_rank.py
+++ b/engine/custom_rank.py
@@ -3,12 +3,13 @@
 import pandas as pd
 
 from custom_db import get_doc_by_id
-from custom_tokenizer import tokenize_data
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from tokenizer import process_text
+
 
 def preprocess_query(Q):
-    tokenized_query = tokenize_data(Q)
+    tokenized_query = process_text(Q)
     return tokenized_query
 
 
diff --git a/engine/main.py b/engine/main.py
index 780c7cd..451e7b7 100755
--- a/engine/main.py
+++ b/engine/main.py
@@ -14,7 +14,7 @@
 # Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
-from custom_tokenizer import Tokenizer
+from tokenizer import Tokenizer
 from index import Indexer
 
 # Threading
diff --git a/engine/requirements.txt b/engine/requirements.txt
index 9cf4449..4792ab8 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -12,3 +12,4 @@ pandas==2.2.2
 scikit-learn==1.5.1
 aiohttp==3.9.5
 spacy==3.7.5
+lxml==5.2.2
diff --git a/engine/test.py b/engine/test.py
deleted file mode 100644
index ad58a80..0000000
--- a/engine/test.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# file to test the written functions
-import logging
-
-from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
-
-CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
-
-top_30_words = top_30_words([CUSTOM_TEXT])
-print(top_30_words)
diff --git a/engine/tokenizer.py b/engine/tokenizer.py
index 925996d..7fc5f19 100644
--- a/engine/tokenizer.py
+++ b/engine/tokenizer.py
@@ -257,7 +257,9 @@ async def process(self, data, link):
     "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.",
 ]
 
-for sentence in test_sentences:
-    print(f"Original: {sentence}")
-    print(f"Tokenized: {process_text(sentence)}")
-    print()
+if __name__ == "__main__":
+
+    for sentence in test_sentences:
+        print(f"Original: {sentence}")
+        print(f"Tokenized: {process_text(sentence)}")
+        print()