Add InterRanker #28

Merged
merged 18 commits on Jul 16, 2024
Changes from 1 commit
108 changes: 108 additions & 0 deletions engine/custom_rank.py
@@ -0,0 +1,108 @@
import pandas as pd
from custom_tokenizer import tokenize_data, tf_idf_vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
def preprocess_query(Q):
tokenized_query = tokenize_data(Q)
return tokenized_query


def find_intersection_2(Q):
    # Load the inverted index and keep "word" as a regular column.
    df_inverted = pd.read_csv("engine/inverted_index.csv", sep=",")
    df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

    print(df_inverted.columns)
    print(df_inverted.head())
    tokenized_query = preprocess_query(Q)
    print(tokenized_query)
    result = []
    for token in tokenized_query:
        if token in df_inverted.word.values:
            print(f"Found token: {token}")
            # The doc_ids column is stored as a string, e.g. "[1, 4, 7]".
            doc_ids = eval(df_inverted.loc[df_inverted["word"] == token, "doc_ids"].iloc[0])
            print(f"It has {len(doc_ids)} doc_ids")
            result.append(doc_ids)
    if not result:
        return set()
    # Intersection of all posting lists collected for the query tokens.
    intersection = set(result[0]).intersection(*result)
    return intersection



def find_documents(Q):
    # Load the inverted index with the doc_ids column parsed into real lists.
    df_inverted = pd.read_csv("engine/inverted_index.csv", converters={'doc_ids': pd.eval})
    df_inverted.set_index("word", inplace=True)
    df_inverted.drop(columns=["Unnamed: 0"], inplace=True)

    print(df_inverted.head())
    tokenized_query = preprocess_query(Q)
    print(df_inverted.index.values)
    result = []
    for token in tokenized_query:
        if token in df_inverted.index.values:
            print(f"Found token: {token}")
            doc_ids = df_inverted.loc[token].doc_ids
            print(f"It has {len(doc_ids)} doc_ids")
            result.append(doc_ids)
    if not result:
        return set()
    # Intersect all posting lists; fall back to the union when the strict
    # AND query matches fewer than two documents.
    intersection = set(result[0]).intersection(*result)
    union = set(result[0]).union(*result)
    if len(intersection) < 2:
        print("No intersection found")
        return union
    return intersection

def dummy(tokens):
    # Identity function: the text is already tokenized, so the vectorizer
    # must not tokenize or preprocess it again.
    return tokens

def generate_tf_idf_matrix(path):
    df = pd.read_csv(path, converters={'tokenized_text': pd.eval})
    df_text = df["tokenized_text"]
    print(type(df_text.values))
    # token_pattern=None silences the warning about the unused default pattern.
    vectorizer = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy, token_pattern=None)
    X = vectorizer.fit_transform(df_text.values)
    features = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
    print(features)

    return features

def rank_documents(subset_D, Q, X):
    # Doc ids are 1-based, the rows of the TF-IDF matrix are 0-based.
    subset_adj = [x - 1 for x in subset_D]
    # Select the rows of the candidate documents (copy to avoid a SettingWithCopyWarning).
    filtered_X = X.loc[subset_adj].copy()

    # Ensure Q is a list of query terms
    query_terms = preprocess_query(Q)
    query_terms_in_X = [term for term in query_terms if term in X.columns]
    if not query_terms_in_X:
        print("No query terms found in the TF-IDF matrix.")
        return pd.DataFrame()
    # Select only the columns corresponding to the query terms.
    filtered_X_query_terms = filtered_X[query_terms_in_X]

    # Sum the TF-IDF values for each document
    filtered_X['sum_tfidf'] = filtered_X_query_terms.sum(axis=1)

    # Rank the documents by the summed TF-IDF values in descending order
    ranked_docs = filtered_X.sort_values(by='sum_tfidf', ascending=False)

    return ranked_docs






query = "max animal future"
docs = find_documents(query)
X = generate_tf_idf_matrix('engine/pages.csv')
print(f"Found {len(docs)} documents, they look like this: {docs}")
print(f"Result: {generate_tf_idf_matrix('engine/pages.csv')}")

ranked_docs = rank_documents(docs, query, X)
best_20_docs = ranked_docs.head(20).index + 1
print(f"Best 20 docs: {best_20_docs}")
8 changes: 6 additions & 2 deletions engine/custom_tokenizer.py
@@ -69,13 +69,17 @@ def tokenize_data(data) -> list[str]:
data = pipe(data)
return data


# Problem: the TfidfVectorizer takes a whole plain text and tokenizes it itself, but we already have pre-tokenized data.
# To avoid converting the data type back and forth and complicating our lives unnecessarily, steps like TF-IDF and tokenizing have to happen right after crawling.
# That is not the cleanest pipeline design, but otherwise we would have to do and store a lot of work twice.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
def tf_idf_vectorize(data):
"""
Vectorizes the input data using the TF-IDF algorithm.
"""
# Create the vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english")
    # vectorizer = TfidfVectorizer(tokenizer=tokenize_data, stop_words="english") # we still need to decide which tokenizer to use here
vectorizer = TfidfVectorizer()
# Vectorize the data
X = vectorizer.fit_transform(data)
return X
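The comment above describes the mismatch between TfidfVectorizer, which expects plain text, and the already-tokenized crawler output. With the simplified TfidfVectorizer() used here, one way to bridge that gap is to join the token lists back into strings before vectorizing; custom_rank.py instead bypasses the built-in tokenizer with identity callables. A minimal sketch with made-up documents:

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical pre-tokenized documents standing in for the crawler output.
pre_tokenized = [
    ["tuebingen", "university", "research"],
    ["research", "animal", "welfare"],
]

# Join the tokens back into plain strings so the default analyzer can handle them.
plain_text = [" ".join(tokens) for tokens in pre_tokenized]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(plain_text)
print(vectorizer.get_feature_names_out())
print(X.shape)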