From 23f59b0680450629eb60b59d43fddcd14f4d0a29 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Wed, 17 Jul 2024 15:19:42 +0200
Subject: [PATCH 1/3] Re-add lxml

---
 engine/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/engine/requirements.txt b/engine/requirements.txt
index 9cf4449..4792ab8 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -12,3 +12,4 @@ pandas==2.2.2
 scikit-learn==1.5.1
 aiohttp==3.9.5
 spacy==3.7.5
+lxml==5.2.2

From 80dcdf28f9efea1fb4faa09c0e5ed0fba1187f44 Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Wed, 17 Jul 2024 15:19:58 +0200
Subject: [PATCH 2/3] Adapt files to changed imports

---
 engine/custom_rank.py | 5 +++--
 engine/main.py        | 2 +-
 engine/test.py        | 9 ---------
 3 files changed, 4 insertions(+), 12 deletions(-)
 delete mode 100644 engine/test.py

diff --git a/engine/custom_rank.py b/engine/custom_rank.py
index 128ec99..d00847d 100644
--- a/engine/custom_rank.py
+++ b/engine/custom_rank.py
@@ -3,12 +3,13 @@
 import pandas as pd
 
 from custom_db import get_doc_by_id
-from custom_tokenizer import tokenize_data
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from tokenizer import process_text
+
 
 def preprocess_query(Q):
-    tokenized_query = tokenize_data(Q)
+    tokenized_query = process_text(Q)
     return tokenized_query
 
 
diff --git a/engine/main.py b/engine/main.py
index 780c7cd..451e7b7 100755
--- a/engine/main.py
+++ b/engine/main.py
@@ -14,7 +14,7 @@
 # Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
-from custom_tokenizer import Tokenizer
+from tokenizer import Tokenizer
 from index import Indexer
 
 # Threading
diff --git a/engine/test.py b/engine/test.py
deleted file mode 100644
index ad58a80..0000000
--- a/engine/test.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# file to test the written functions
-import logging
-
-from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
-
-CUSTOM_TEXT = "Lorem Ipsum is simply dummy text" + " " + " \n "+ "of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."
-
-top_30_words = top_30_words([CUSTOM_TEXT])
-print(top_30_words)

From ccd9fa90719784b7594f2a59007c4ecd8a96347c Mon Sep 17 00:00:00 2001
From: am9zZWY <46693545+am9zZWY@users.noreply.github.com>
Date: Wed, 17 Jul 2024 15:20:09 +0200
Subject: [PATCH 3/3] Add main for tokenizer

---
 engine/tokenizer.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/engine/tokenizer.py b/engine/tokenizer.py
index 925996d..7fc5f19 100644
--- a/engine/tokenizer.py
+++ b/engine/tokenizer.py
@@ -257,7 +257,9 @@ async def process(self, data, link):
     "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.",
 ]
 
-for sentence in test_sentences:
-    print(f"Original: {sentence}")
-    print(f"Tokenized: {process_text(sentence)}")
-    print()
+if __name__ == "__main__":
+
+    for sentence in test_sentences:
+        print(f"Original: {sentence}")
+        print(f"Tokenized: {process_text(sentence)}")
+        print()