
Commit

Fix
PhictionalOne committed Jul 18, 2024
1 parent e303a42 commit 9b09920
Showing 5 changed files with 57 additions and 21 deletions.
2 changes: 1 addition & 1 deletion engine/download.py
@@ -51,6 +51,6 @@ async def process(self):

      soup = pickle.loads(lzma.decompress(blob))

-     await self.propagate_to_next(soup, doc_id, link)
+     await self.propagate_to_next(soup, link, doc_id = doc_id)

      row = self.cursor.fetchone()
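The call site now matches the reordered process(self, data, link, doc_id=None) signature introduced in engine/index.py below, passing doc_id by keyword so stages that treat it as optional still receive it. A minimal sketch of that dispatch convention, assuming propagate_to_next simply awaits each successor's process coroutine (the helper itself is not shown in this commit, so its body here is an assumption):

# Hypothetical sketch of the pipeline dispatch convention; not code from this commit.
class PipelineElement:
    def __init__(self):
        self.next_elements: list["PipelineElement"] = []

    async def process(self, data, link, doc_id=None):
        raise NotImplementedError

    async def propagate_to_next(self, data, link, doc_id=None):
        # Forward data and link positionally and doc_id by keyword,
        # mirroring the updated call in download.py.
        for element in self.next_elements:
            await element.process(data, link, doc_id=doc_id)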
20 changes: 14 additions & 6 deletions engine/index.py
@@ -16,12 +16,12 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):

      self.cursor = dbcon.cursor()

-     self._load_state()
+     # self._load_state()

  def __del__(self):
      self.cursor.close()

- async def process(self, data, doc_id, link):
+ async def process(self, data, link, doc_id = None):
      """
      Indexes the input data.
      """
@@ -40,10 +40,18 @@ async def process(self, data, doc_id, link):
      description = soup.find("meta", attrs={"name": "description"})
      description_content = description.get("content") if description is not None else ""

-     self.cursor.execute("""
-         INSERT INTO documents(id, link, title, description)
-         VALUES (?, ?, ?, ?)
-     """, [doc_id, link, title_content, description_content])
+     if doc_id:
+         self.cursor.execute("""
+             INSERT INTO documents(id, link, title, description)
+             VALUES (?, ?, ?, ?)
+         """, [doc_id, link, title_content, description_content])
+     else:
+         self.cursor.execute("""
+             INSERT INTO documents(link, title, description)
+             VALUES (?, ?, ?)
+         """, [link, title_content, description_content])
+
+     doc_id = self.cursor.execute("SELECT id FROM documents WHERE link = ?", [link]).fetchone()[0]

      print(f"Indexed {link}")
      if not self.is_shutdown():
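With doc_id now optional, the indexer either reuses the id handed down from the previous stage or lets the documents table assign one and then re-reads it by link before propagating further. A self-contained sketch of that fallback path, assuming documents.id carries a sequence DEFAULT analogous to words.id in setup.sql (the documents DDL is not part of this diff, so the table definition and helper below are illustrative):

# Standalone sketch of the optional-doc_id path; the documents DDL is an assumption.
import duckdb

con = duckdb.connect()
con.execute("CREATE SEQUENCE doc_ids")
con.execute("""
    CREATE TABLE documents (
        id INTEGER DEFAULT nextval('doc_ids') PRIMARY KEY,
        link VARCHAR UNIQUE,
        title VARCHAR,
        description VARCHAR
    )
""")

def index_document(link, title, description, doc_id=None):
    # Keep an id handed down by an earlier stage; otherwise let the
    # sequence DEFAULT assign one.
    if doc_id is not None:
        con.execute("INSERT INTO documents(id, link, title, description) VALUES (?, ?, ?, ?)",
                    [doc_id, link, title, description])
    else:
        con.execute("INSERT INTO documents(link, title, description) VALUES (?, ?, ?)",
                    [link, title, description])
    # Re-read the id by link so downstream stages always receive a concrete value.
    return con.execute("SELECT id FROM documents WHERE link = ?", [link]).fetchone()[0]

print(index_document("https://example.org", "Example", "demo page"))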
25 changes: 25 additions & 0 deletions engine/main.py
@@ -108,6 +108,31 @@ def signal_handler(signum, frame):
      con.close()
      print("State saved")

+     # Compute TF-IDF matrix
+     con.execute("""
+         WITH
+         DocumentCount(total_docs) AS (
+             SELECT COUNT(*) FROM documents
+         ),
+         TermFrequence AS Inverted_Index,
+         DocumentFrequency(word, doc_count) AS (
+             SELECT word, COUNT(DISTINCT doc) AS doc_count
+             FROM Inverted_Index
+         ),
+         TFIDF(doc, word, tfidf) AS (
+             SELECT tf.doc, tf.word,
+                    tf.amount * LOG((total_docs * 1.0) / df.doc_count)
+             FROM TermFrequency AS tf,
+                  DocumentCount AS _(total_docs),
+                  DocumentFrequency AS df
+             WHERE tf.word = df.word
+         )
+         INSERT INTO TFIDFs (doc, word, tfidf)
+         SELECT doc, word, tfidf
+         FROM TFIDF
+         WHERE tfidf > 0
+     """)

      # Save the state+
      for element in [crawler, indexer, tokenizer]:
          element.save_state()
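The new block materialises the TF-IDF weights: for a document d and word w, tfidf(d, w) = tf(d, w) · log(N / df(w)), where tf(d, w) is the stored amount, N the total number of documents, and df(w) the number of documents containing w. A word that occurs in every document scores log(1) = 0, which the final tfidf > 0 filter drops. A self-contained sketch of the same computation written directly against Inverted_Index (the in-memory connection, sample rows, and DOUBLE column type are illustrative, not from the repository):

# Sketch: compute tf-idf weights straight from Inverted_Index.
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")
con.execute("CREATE TABLE TFIDFs (doc INTEGER, word INTEGER, tfidf DOUBLE)")
con.execute("INSERT INTO Inverted_Index VALUES (1, 1, 3), (1, 2, 1), (2, 1, 2)")

con.execute("""
    INSERT INTO TFIDFs
    SELECT tf.doc, tf.word,
           tf.amount * LOG(dc.total_docs / df.doc_count) AS tfidf
    FROM Inverted_Index AS tf
    JOIN (SELECT word, COUNT(DISTINCT doc) AS doc_count
          FROM Inverted_Index
          GROUP BY word) AS df USING (word)
    CROSS JOIN (SELECT CAST(COUNT(DISTINCT doc) AS DOUBLE) AS total_docs
                FROM Inverted_Index) AS dc
    -- A word present in every document has df = N, hence tf-idf = 0; skip it.
    WHERE df.doc_count < dc.total_docs
""")

print(con.execute("SELECT * FROM TFIDFs ORDER BY doc, word").fetchall())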
21 changes: 10 additions & 11 deletions engine/setup.sql
@@ -28,29 +28,28 @@ CREATE TABLE documents (
  );

  CREATE TABLE words (
-     word VARCHAR PRIMARY KEY,
-     id INTEGER DEFAULT nextval('word_ids') UNIQUE,
-     occurrences INTEGER NOT NULL
+     word VARCHAR PRIMARY KEY,
+     id INTEGER DEFAULT nextval('word_ids') UNIQUE,
  );

  CREATE TABLE Inverted_Index (
-     word INTEGER,
-     doc INTEGER,
+     word INTEGER,
+     doc INTEGER,
      amount INTEGER,
      PRIMARY KEY (word, doc),
-     FOREIGN KEY (word) ON words(id),
-     FOREIGN KEY (doc) ON documents(id)
+     FOREIGN KEY (word) REFERENCES words (id),
+     FOREIGN KEY (doc) REFERENCES documents (id)
  );

- CREATE INDEX inverted_index_word(word);
+ -- CREATE INDEX inverted_index_word(word);

  CREATE TABLE TFIDFs (
      doc INTEGER,
      word INTEGER,
      tfidf NUMERIC(10,9) NOT NULL,
      PRIMARY KEY (word, doc),
-     FOREIGN KEY (word) ON words(id),
-     FOREIGN KEY (doc) ON documents(id)
+     FOREIGN KEY (word) REFERENCES words (id),
+     FOREIGN KEY (doc) REFERENCES documents (id)
  );

- CREATE INDEX tfidfs_word(word);
+ -- CREATE INDEX tfidfs_word (word);
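Besides switching the foreign keys to standard REFERENCES syntax and dropping the unused occurrences column, the commit comments out both index statements; as written they lack the ON <table> clause DuckDB expects. If the indexes are wanted later, a plausible form is sketched below (an assumption, not part of this commit; the minimal table definitions exist only to make the snippet runnable):

# Sketch: the commented-out indexes with DuckDB's ON <table> clause.
# Minimal stand-in tables; the real DDL lives in engine/setup.sql.
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")
con.execute("CREATE TABLE TFIDFs (doc INTEGER, word INTEGER, tfidf DOUBLE)")

con.execute("CREATE INDEX inverted_index_word ON Inverted_Index (word)")
con.execute("CREATE INDEX tfidfs_word ON TFIDFs (word)")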
10 changes: 7 additions & 3 deletions engine/tokenizer.py
@@ -228,7 +228,8 @@ async def process(self, data, doc_id, link):
      # Tokenize the text
      try:
          tokenized_text : list[str] = process_text(text=text)
-         tokens = pd.DataFrame({'doc_id': doc_id, 'token': tokenized_text})
+         tokens = pd.DataFrame({'token': tokenized_text})
+         tokens['doc_id'] = doc_id
          self.cursor.execute("""
              INSERT INTO words(word)
              SELECT DISTINCT token
@@ -237,14 +238,17 @@

          self.cursor.execute("""
              INSERT INTO Inverted_Index(word, doc, amount)
-             SELECT w.id, t.doc_id, COUNT(t.token)
+             SELECT w.id, t.doc_id, COUNT(*)
              FROM tokens AS t, words AS w
              WHERE t.token = w.word
-             GROUP BY t.token
+             GROUP BY w.id, t.doc_id, t.token
          """)
          print(f"Tokenized text for {link}")
      except Exception as e:
          print(f"Error tokenizing text for {link}: {str(e)}")
+         tokens = pd.DataFrame({'token': tokenized_text})
+         tokens['doc_id'] = doc_id
+         print(doc_id)


  def clean_text(text):
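The tokenizer now builds the DataFrame from the token list alone and attaches doc_id as a constant column, and the aggregation groups by word id and document instead of only by token, so each (word, doc) pair ends up as a single row carrying its count. A self-contained sketch of that counting step (sample tokens and stand-in tables are illustrative; the real schema lives in engine/setup.sql):

# Sketch of the per-document term counting done in the tokenizer stage.
import duckdb
import pandas as pd

con = duckdb.connect()
con.execute("CREATE SEQUENCE word_ids")
con.execute("""
    CREATE TABLE words (
        word VARCHAR PRIMARY KEY,
        id INTEGER DEFAULT nextval('word_ids') UNIQUE
    )
""")
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")

doc_id = 1
tokenized_text = ["search", "engine", "search", "index"]

# Build the frame from the tokens, then attach doc_id as a constant column,
# mirroring the change in this hunk.
tokens = pd.DataFrame({"token": tokenized_text})
tokens["doc_id"] = doc_id

# Register the DataFrame explicitly (DuckDB can also pick up local DataFrames by name).
con.register("tokens", tokens)

con.execute("INSERT INTO words(word) SELECT DISTINCT token FROM tokens")
con.execute("""
    INSERT INTO Inverted_Index(word, doc, amount)
    SELECT w.id, t.doc_id, COUNT(*)
    FROM tokens AS t
    JOIN words AS w ON t.token = w.word
    GROUP BY w.id, t.doc_id
""")
print(con.execute("SELECT * FROM Inverted_Index ORDER BY word").fetchall())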
