Skip to content

Commit

Permalink
Add more information to ranked results to be compatible with web interface
Browse files Browse the repository at this point in the history
  • Loading branch information
am9zZWY committed Jul 16, 2024
1 parent cc0ba01 commit 7c47662
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
9 changes: 9 additions & 0 deletions engine/custom_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ def save_pages() -> None:
pages_df.to_csv("pages.csv", index=False, header=headers)


def get_doc_by_id(page_id: int):
    """
    Select the rows of the module-level pages DataFrame whose 'id' column
    equals the given page ID.

    Args:
        page_id: Value compared against the 'id' column.

    Returns:
        The filtered DataFrame holding every matching row; it is empty when
        no row matches.
    """
    global pages_df
    # Boolean mask that is True for each row whose 'id' equals page_id.
    matches_id = pages_df['id'] == page_id
    return pages_df[matches_id]


def load_pages() -> pd.DataFrame:
"""
Load the pages DataFrame from a CSV file.
Expand All @@ -177,3 +183,6 @@ def load_pages() -> pd.DataFrame:

logging.info("Loaded pages")
return pages_df


load_pages()
37 changes: 27 additions & 10 deletions engine/custom_rank.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
from custom_tokenizer import tokenize_data, tf_idf_vectorize

from custom_db import get_doc_by_id
from custom_tokenizer import tokenize_data
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_query(Q):
tokenized_query = tokenize_data(Q)
Expand Down Expand Up @@ -89,13 +91,29 @@ def rank_documents(subset_D, Q, X):

# Rank the documents by the summed TF-IDF values in descending order
ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False)

return ranked_docs






# Map document ID to document with title, URL, and snippet
ranking = []
for index, ranked_doc in ranked_docs.iterrows():
score = ranked_doc['sum_tfidf']

doc = get_doc_by_id(index)
title = str(doc['title'].values[0]) if not doc.empty else ""
url = str(doc['url'].values[0]) if not doc.empty else ""
snippet = str(doc['snippet'].values[0]) if not doc.empty else ""

result = {
"id": index,
"title": title,
"url": url,
"description": snippet if snippet else "",
"summary": "",
"score": score
}
ranking.append(result)

return ranking


query = "max animal future"
docs = find_documents(query)
Expand All @@ -104,5 +122,4 @@ def rank_documents(subset_D, Q, X):
print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}")

ranked_docs = rank_documents(docs, query, X)
best_20_docs = ranked_docs.head(20).index + 1
print(f"Best 20 docs: {best_20_docs}")
print(f"Best 20 docs: {ranked_docs[:20]}")

0 comments on commit 7c47662

Please sign in to comment.