
Commit

Fix
PhictionalOne committed Jul 18, 2024
1 parent e303a42 commit 9b09920
Showing 5 changed files with 57 additions and 21 deletions.
2 changes: 1 addition & 1 deletion engine/download.py
@@ -51,6 +51,6 @@ async def process(self):

      soup = pickle.loads(lzma.decompress(blob))

-     await self.propagate_to_next(soup, doc_id, link)
+     await self.propagate_to_next(soup, link, doc_id = doc_id)

      row = self.cursor.fetchone()
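The call site now matches the reordered process(self, data, link, doc_id=None) signature introduced in engine/index.py below, passing doc_id by keyword so stages that treat it as optional still receive it. A minimal sketch of that dispatch convention, assuming propagate_to_next simply awaits each successor's process coroutine (the helper itself is not shown in this commit, so its body here is an assumption):

# Hypothetical sketch of the pipeline dispatch convention; not code from this commit.
class PipelineElement:
    def __init__(self):
        self.next_elements: list["PipelineElement"] = []

    async def process(self, data, link, doc_id=None):
        raise NotImplementedError

    async def propagate_to_next(self, data, link, doc_id=None):
        # Forward data and link positionally and doc_id by keyword,
        # mirroring the updated call in download.py.
        for element in self.next_elements:
            await element.process(data, link, doc_id=doc_id)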
20 changes: 14 additions & 6 deletions engine/index.py
@@ -16,12 +16,12 @@ def __init__(self, dbcon: duckdb.DuckDBPyConnection):

      self.cursor = dbcon.cursor()

-     self._load_state()
+     # self._load_state()

  def __del__(self):
      self.cursor.close()

- async def process(self, data, doc_id, link):
+ async def process(self, data, link, doc_id = None):
      """
      Indexes the input data.
      """
@@ -40,10 +40,18 @@ async def process(self, data, doc_id, link):
      description = soup.find("meta", attrs={"name": "description"})
      description_content = description.get("content") if description is not None else ""

-     self.cursor.execute("""
-         INSERT INTO documents(id, link, title, description)
-         VALUES (?, ?, ?, ?)
-     """, [doc_id, link, title_content, description_content])
+     if doc_id:
+         self.cursor.execute("""
+             INSERT INTO documents(id, link, title, description)
+             VALUES (?, ?, ?, ?)
+         """, [doc_id, link, title_content, description_content])
+     else:
+         self.cursor.execute("""
+             INSERT INTO documents(link, title, description)
+             VALUES (?, ?, ?)
+         """, [link, title_content, description_content])
+
+     doc_id = self.cursor.execute("SELECT id FROM documents WHERE link = ?", [link]).fetchone()[0]

      print(f"Indexed {link}")
      if not self.is_shutdown():
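With doc_id now optional, the indexer either reuses the id handed down from the previous stage or lets the documents table assign one and then re-reads it by link before propagating further. A self-contained sketch of that fallback path, assuming documents.id carries a sequence DEFAULT analogous to words.id in setup.sql (the documents DDL is not part of this diff, so the table definition and helper below are illustrative):

# Standalone sketch of the optional-doc_id path; the documents DDL is an assumption.
import duckdb

con = duckdb.connect()
con.execute("CREATE SEQUENCE doc_ids")
con.execute("""
    CREATE TABLE documents (
        id INTEGER DEFAULT nextval('doc_ids') PRIMARY KEY,
        link VARCHAR UNIQUE,
        title VARCHAR,
        description VARCHAR
    )
""")

def index_document(link, title, description, doc_id=None):
    # Keep an id handed down by an earlier stage; otherwise let the
    # sequence DEFAULT assign one.
    if doc_id is not None:
        con.execute("INSERT INTO documents(id, link, title, description) VALUES (?, ?, ?, ?)",
                    [doc_id, link, title, description])
    else:
        con.execute("INSERT INTO documents(link, title, description) VALUES (?, ?, ?)",
                    [link, title, description])
    # Re-read the id by link so downstream stages always receive a concrete value.
    return con.execute("SELECT id FROM documents WHERE link = ?", [link]).fetchone()[0]

print(index_document("https://example.org", "Example", "demo page"))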
25 changes: 25 additions & 0 deletions engine/main.py
@@ -108,6 +108,31 @@ def signal_handler(signum, frame):
      con.close()
      print("State saved")

+     # Compute TF-IDF matrix
+     con.execute("""
+         WITH
+         DocumentCount(total_docs) AS (
+             SELECT COUNT(*) FROM documents
+         ),
+         TermFrequence AS Inverted_Index,
+         DocumentFrequency(word, doc_count) AS (
+             SELECT word, COUNT(DISTINCT doc) AS doc_count
+             FROM Inverted_Index
+         ),
+         TFIDF(doc, word, tfidf) AS (
+             SELECT tf.doc, tf.word,
+                    tf.amount * LOG((total_docs * 1.0) / df.doc_count)
+             FROM TermFrequency AS tf,
+                  DocumentCount AS _(total_docs),
+                  DocumentFrequency AS df
+             WHERE tf.word = df.word
+         )
+         INSERT INTO TFIDFs (doc, word, tfidf)
+         SELECT doc, word, tfidf
+         FROM TFIDF
+         WHERE tfidf > 0
+     """)

      # Save the state+
      for element in [crawler, indexer, tokenizer]:
          element.save_state()
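The new block materialises the TF-IDF weights: for a document d and word w, tfidf(d, w) = tf(d, w) · log(N / df(w)), where tf(d, w) is the stored amount, N the total number of documents, and df(w) the number of documents containing w. A word that occurs in every document scores log(1) = 0, which the final tfidf > 0 filter drops. A self-contained sketch of the same computation written directly against Inverted_Index (the in-memory connection, sample rows, and DOUBLE column type are illustrative, not from the repository):

# Sketch: compute tf-idf weights straight from Inverted_Index.
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")
con.execute("CREATE TABLE TFIDFs (doc INTEGER, word INTEGER, tfidf DOUBLE)")
con.execute("INSERT INTO Inverted_Index VALUES (1, 1, 3), (1, 2, 1), (2, 1, 2)")

con.execute("""
    INSERT INTO TFIDFs
    SELECT tf.doc, tf.word,
           tf.amount * LOG(dc.total_docs / df.doc_count) AS tfidf
    FROM Inverted_Index AS tf
    JOIN (SELECT word, COUNT(DISTINCT doc) AS doc_count
          FROM Inverted_Index
          GROUP BY word) AS df USING (word)
    CROSS JOIN (SELECT CAST(COUNT(DISTINCT doc) AS DOUBLE) AS total_docs
                FROM Inverted_Index) AS dc
    -- A word present in every document has df = N, hence tf-idf = 0; skip it.
    WHERE df.doc_count < dc.total_docs
""")

print(con.execute("SELECT * FROM TFIDFs ORDER BY doc, word").fetchall())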
21 changes: 10 additions & 11 deletions engine/setup.sql
@@ -28,29 +28,28 @@ CREATE TABLE documents (
  );

  CREATE TABLE words (
-     word VARCHAR PRIMARY KEY,
-     id INTEGER DEFAULT nextval('word_ids') UNIQUE,
-     occurrences INTEGER NOT NULL
+     word VARCHAR PRIMARY KEY,
+     id INTEGER DEFAULT nextval('word_ids') UNIQUE,
  );

  CREATE TABLE Inverted_Index (
-     word INTEGER,
-     doc INTEGER,
+     word INTEGER,
+     doc INTEGER,
      amount INTEGER,
      PRIMARY KEY (word, doc),
-     FOREIGN KEY (word) ON words(id),
-     FOREIGN KEY (doc) ON documents(id)
+     FOREIGN KEY (word) REFERENCES words (id),
+     FOREIGN KEY (doc) REFERENCES documents (id)
  );

- CREATE INDEX inverted_index_word(word);
+ -- CREATE INDEX inverted_index_word(word);

  CREATE TABLE TFIDFs (
      doc INTEGER,
      word INTEGER,
      tfidf NUMERIC(10,9) NOT NULL,
      PRIMARY KEY (word, doc),
-     FOREIGN KEY (word) ON words(id),
-     FOREIGN KEY (doc) ON documents(id)
+     FOREIGN KEY (word) REFERENCES words (id),
+     FOREIGN KEY (doc) REFERENCES documents (id)
  );

- CREATE INDEX tfidfs_word(word);
+ -- CREATE INDEX tfidfs_word (word);
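Besides switching the foreign keys to standard REFERENCES syntax and dropping the unused occurrences column, the commit comments out both index statements; as written they lack the ON <table> clause DuckDB expects. If the indexes are wanted later, a plausible form is sketched below (an assumption, not part of this commit; the minimal table definitions exist only to make the snippet runnable):

# Sketch: the commented-out indexes with DuckDB's ON <table> clause.
# Minimal stand-in tables; the real DDL lives in engine/setup.sql.
import duckdb

con = duckdb.connect()
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")
con.execute("CREATE TABLE TFIDFs (doc INTEGER, word INTEGER, tfidf DOUBLE)")

con.execute("CREATE INDEX inverted_index_word ON Inverted_Index (word)")
con.execute("CREATE INDEX tfidfs_word ON TFIDFs (word)")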
10 changes: 7 additions & 3 deletions engine/tokenizer.py
@@ -228,7 +228,8 @@ async def process(self, data, doc_id, link):
      # Tokenize the text
      try:
          tokenized_text : list[str] = process_text(text=text)
-         tokens = pd.DataFrame({'doc_id': doc_id, 'token': tokenized_text})
+         tokens = pd.DataFrame({'token': tokenized_text})
+         tokens['doc_id'] = doc_id
          self.cursor.execute("""
              INSERT INTO words(word)
              SELECT DISTINCT token
@@ -237,14 +238,17 @@

          self.cursor.execute("""
              INSERT INTO Inverted_Index(word, doc, amount)
-             SELECT w.id, t.doc_id, COUNT(t.token)
+             SELECT w.id, t.doc_id, COUNT(*)
              FROM tokens AS t, words AS w
              WHERE t.token = w.word
-             GROUP BY t.token
+             GROUP BY w.id, t.doc_id, t.token
          """)
          print(f"Tokenized text for {link}")
      except Exception as e:
          print(f"Error tokenizing text for {link}: {str(e)}")
+         tokens = pd.DataFrame({'token': tokenized_text})
+         tokens['doc_id'] = doc_id
+         print(doc_id)


  def clean_text(text):
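The tokenizer now builds the DataFrame from the token list alone and attaches doc_id as a constant column, and the aggregation groups by word id and document instead of only by token, so each (word, doc) pair ends up as a single row carrying its count. A self-contained sketch of that counting step (sample tokens and stand-in tables are illustrative; the real schema lives in engine/setup.sql):

# Sketch of the per-document term counting done in the tokenizer stage.
import duckdb
import pandas as pd

con = duckdb.connect()
con.execute("CREATE SEQUENCE word_ids")
con.execute("""
    CREATE TABLE words (
        word VARCHAR PRIMARY KEY,
        id INTEGER DEFAULT nextval('word_ids') UNIQUE
    )
""")
con.execute("CREATE TABLE Inverted_Index (word INTEGER, doc INTEGER, amount INTEGER)")

doc_id = 1
tokenized_text = ["search", "engine", "search", "index"]

# Build the frame from the tokens, then attach doc_id as a constant column,
# mirroring the change in this hunk.
tokens = pd.DataFrame({"token": tokenized_text})
tokens["doc_id"] = doc_id

# Register the DataFrame explicitly (DuckDB can also pick up local DataFrames by name).
con.register("tokens", tokens)

con.execute("INSERT INTO words(word) SELECT DISTINCT token FROM tokens")
con.execute("""
    INSERT INTO Inverted_Index(word, doc, amount)
    SELECT w.id, t.doc_id, COUNT(*)
    FROM tokens AS t
    JOIN words AS w ON t.token = w.word
    GROUP BY w.id, t.doc_id
""")
print(con.execute("SELECT * FROM Inverted_Index ORDER BY word").fetchall())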
