Merge pull request #37 from am9zZWY/tokenization
Tokenization
am9zZWY authored Jul 17, 2024
2 parents b302f9c + ccd9fa9 commit 02b75f7
Showing 5 changed files with 11 additions and 16 deletions.
5 changes: 3 additions & 2 deletions engine/custom_rank.py
@@ -3,12 +3,13 @@
 import pandas as pd
 
 from custom_db import get_doc_by_id
-from custom_tokenizer import tokenize_data
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from tokenizer import process_text
+
 
 def preprocess_query(Q):
-    tokenized_query = tokenize_data(Q)
+    tokenized_query = process_text(Q)
     return tokenized_query
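As a quick illustration of the switch from tokenize_data to process_text, here is a minimal, self-contained sketch of the preprocess_query pattern. The process_text stand-in below is an assumption for demonstration only; the repository's real implementation lives in engine/tokenizer.py and is not shown in this hunk.

# Sketch only: a simple stand-in for the repository's process_text, assumed to
# return a list of token strings for a query.
import re


def process_text(text: str) -> list[str]:
    return re.findall(r"[a-z0-9']+", text.lower())


def preprocess_query(Q):
    tokenized_query = process_text(Q)
    return tokenized_query


if __name__ == "__main__":
    print(preprocess_query("How tall is the Eiffel Tower?"))
    # -> ['how', 'tall', 'is', 'the', 'eiffel', 'tower']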
2 changes: 1 addition & 1 deletion engine/main.py
@@ -14,7 +14,7 @@
 # Pipeline
 from crawl import Crawler
 from custom_db import index_pages, access_index, save_pages
-from custom_tokenizer import Tokenizer
+from tokenizer import Tokenizer
 from index import Indexer
 
 # Threading
1 change: 1 addition & 0 deletions engine/requirements.txt
@@ -12,3 +12,4 @@ pandas==2.2.2
 scikit-learn==1.5.1
 aiohttp==3.9.5
 spacy==3.7.5
+lxml==5.2.2
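lxml==5.2.2 is the only new dependency in this commit. In a crawl-and-index pipeline like this one it is plausibly used for HTML parsing; the snippet below is a generic illustration of the library and not code from this repository.

# Generic lxml usage sketch (not from this repository): parse an HTML fragment
# and pull out its visible text, e.g. before tokenization.
from lxml import html

doc = html.fromstring("<html><body><h1>Title</h1><p>Some crawled text.</p></body></html>")
print(doc.text_content())  # TitleSome crawled text.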
9 changes: 0 additions & 9 deletions engine/test.py

This file was deleted.

10 changes: 6 additions & 4 deletions engine/tokenizer.py
@@ -257,7 +257,9 @@ async def process(self, data, link):
     "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.",
 ]
 
-for sentence in test_sentences:
-    print(f"Original: {sentence}")
-    print(f"Tokenized: {process_text(sentence)}")
-    print()
+if __name__ == "__main__":
+
+    for sentence in test_sentences:
+        print(f"Original: {sentence}")
+        print(f"Tokenized: {process_text(sentence)}")
+        print()
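The change above wraps the demo loop in a __main__ guard, so importing the module (as engine/custom_rank.py and engine/main.py now do) no longer runs it. A standalone sketch of the pattern, with a stand-in process_text since the real tokenizer is not shown in this hunk:

# Demo of the __main__ guard added in this commit; process_text here is a
# simple stand-in, not the repository's implementation.
test_sentences = [
    "I'm 6'2\" tall and I weigh 180 lbs. I'm 25 years old.",
]


def process_text(text: str) -> list[str]:
    # Stand-in tokenizer: lowercase and split on whitespace.
    return text.lower().split()


if __name__ == "__main__":
    # Runs only when executed directly, not when imported.
    for sentence in test_sentences:
        print(f"Original: {sentence}")
        print(f"Tokenized: {process_text(sentence)}")
        print()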
