
Commit

Merge pull request #9 from am9zZWY/lukas-tokenize
Add tokenizer pipeline
am9zZWY authored Jul 10, 2024
2 parents afd8c5f + a82845f commit 5a5c17c
Showing 5 changed files with 153 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
crawler_states/*
*.csv

# Created by https://www.toptal.com/developers/gitignore/api/python,git,visualstudiocode,macos,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=python,git,visualstudiocode,macos,linux
15 changes: 15 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}
55 changes: 35 additions & 20 deletions engine/crawl.py
@@ -15,10 +15,11 @@
from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
##### Language detection #####
from nltk.classify import textcat
from custom_db import *

##### Constants #####
# Maximum size of the links
MAX_SIZE = 1000
MAX_SIZE = 20
# Keywords to search for
# They must be present in the HTML of the page
REQUIRED_KEYWORDS = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
@@ -62,7 +63,7 @@
# Supported languages
LANGS = ["en", "eng", "en-GB", "en-US", "english"]
# Maximum number of threads
MAX_THREADS = 10
MAX_THREADS = 5
# User-Agent
USER_AGENT = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"

@@ -171,6 +172,7 @@ def crawl(self) -> None:
# If we have reached the maximum size, stop
if len(found_links) >= MAX_SIZE:
print("max size reached")
print(get_overview())
break

# Get the next link to crawl
@@ -202,6 +204,7 @@ def crawl(self) -> None:
try:
response = requests.get(link, timeout=5, headers={"User-Agent": USER_AGENT}, allow_redirects=True, stream=True, proxies=False, auth=False, cookies=False)
soup = BeautifulSoup(response.text, "lxml")
#print(f"This is soup.text: {soup.text}")
text = soup.text.lower()

# Check language in html-tag and in the link
@@ -230,23 +233,6 @@ def check_link_lang(link):
html_lang = soup.find("html").get("lang")
xml_lang = soup.find("html").get("xml:lang")

img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

text = soup.text.lower()
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)
if i == 1:
print(f"Text: {text}")
print(f"Type of text: {type(text)}")
print("Now printing top 30 words")
top_30 = top_30_words(data=[text])
print(f"Top 30 words: {top_30}")
i+=1

if not check_lang(html_lang) and not check_lang(xml_lang) and not check_link_lang(link) and not check_text_lang(text):
print(crawling_str + "unsupported language")
ignore_links.add(link)
@@ -279,6 +265,28 @@ def check_link_lang(link):
if link not in found_links and link not in ignore_links:
found_links.add(link)

img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)

tokenized_text = tokenize_data(data=text)
if i == 1:
# print(f"Text: {text}")
# print(f"Type of text: {type(text)}")
# print("Now printing top 30 words")
# top_30 = top_30_words(data=[text])
# print(f"Top 30 words: {top_30}")
# i+=1
print("Saving the following into the database")
print(f"URL: {link}")
print(f"Tokenized text: {tokenized_text}")
save_html_to_df(url=link, tokenized_text=tokenized_text)

print(crawling_str + "done")

except Exception as e:
Expand Down Expand Up @@ -353,4 +361,11 @@ def start_crawl():


if __name__ == "__main__":
start_crawl()
start_crawl() # in crawling, we also tokenize
# TODO - separate crawling and tokenizing
index_pages()
index_df = access_index()
index_df.to_csv("inverted_index.csv")
save_pages()
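
Taken together, the crawl.py changes wire the new tokenizer and the in-memory store into the crawl loop: the assembled page text (body, image alt texts, meta description, title) is run through tokenize_data, stored via save_html_to_df, and after crawling the inverted index is built and exported. A minimal standalone sketch of that flow, using an illustrative URL and HTML snippet instead of a live request (it assumes the NLTK stopwords corpus is available):

from bs4 import BeautifulSoup

from custom_tokenizer import tokenize_data
from custom_db import save_html_to_df, index_pages, access_index, save_pages

# Illustrative page; crawl.py assembles `text` from a real response the same way.
html = "<html><head><title>Tübingen</title></head><body><p>Welcome to Tübingen!</p></body></html>"
soup = BeautifulSoup(html, "lxml")

title = soup.find("title")
text = soup.text.lower() + " " + (title.string if title is not None else "")

tokens = tokenize_data(data=text)
save_html_to_df(url="https://example.org/tuebingen", tokenized_text=tokens)

# After the crawl loop finishes: build the inverted index and export everything.
index_pages()
access_index().to_csv("inverted_index.csv")
save_pages()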


46 changes: 46 additions & 0 deletions engine/custom_db.py
@@ -0,0 +1,46 @@
import pandas as pd

from collections import defaultdict
import re

# Create a DataFrame to store HTML pages
pages_df = pd.DataFrame(columns=['id', 'url', 'tokenized_text'])
inverted_index = defaultdict(list)

def save_html_to_df(url, tokenized_text):
global pages_df
new_id = len(pages_df) + 1
new_row = {'id': new_id, 'url': url, 'tokenized_text': tokenized_text}
pages_df = pd.concat([pages_df,pd.DataFrame([new_row])], ignore_index=True)


# Create an inverted index

def get_overview():
return pages_df.head()

def save_pages():
global pages_df
pages_df.to_csv("pages.csv")

def add_document_to_index(doc_id, words: list[str]):
print(f"Adding stuff")
global inverted_index
for word in set(words):
inverted_index[word].append(doc_id)


def index_pages():
for index, row in pages_df.iterrows():

add_document_to_index(row['id'], row['tokenized_text'])


def access_index():
index_df = pd.DataFrame(list(inverted_index.items()), columns=['word', 'doc_ids'])
return index_df


# Convert the inverted index to a DataFrame
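
As a quick illustration of the helpers above: the inverted index maps each distinct token to the list of document ids that contain it, and access_index flattens that mapping into a word / doc_ids DataFrame. A small standalone example with made-up ids and already tokenized words:

from custom_db import add_document_to_index, access_index

# Made-up document ids and token lists, purely for illustration.
add_document_to_index(1, ["tübingen", "university", "crawler"])
add_document_to_index(2, ["tübingen", "neckar"])

print(access_index())
# One row per distinct word, e.g. (row order may vary because a set is used):
#          word  doc_ids
# 0    tübingen   [1, 2]
# 1  university      [1]
# 2     crawler      [1]
# 3      neckar      [2]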


67 changes: 56 additions & 11 deletions engine/custom_tokenizer.py
@@ -3,19 +3,64 @@
import re
# We have to name this file something other than tokenizer.py because otherwise there would be a conflict with the Beautiful Soup tokenizer
# and/or nltk tokenizer
from nltk.corpus import stopwords
import re
import nltk

def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
return text

# Removes HTML tags
def remove_html(text):
html_tag = re.compile(r'<.*?>')
text = html_tag.sub(r'', text)
return text

# Removes URL data
def remove_url(text):
url_clean = re.compile(r"https://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text


# Removes Emojis
def remove_emoji(text):
emoji_clean = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
text = emoji_clean.sub(r'', text)
url_clean = re.compile(r"https://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text

def tokenize_plain_words(words: str):
return words.split()

def stem_and_remove_stopwords(words):
# use the English Porter stemmer

stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
return words



def tokenize_data(data):
"""
Tokenizes the input data.
"""
# delete whitespaces
text = data.strip()
text = re.sub(r'\s+', ' ', text)
# Split the data into words
print(f"Text: {text}")
words = nltk.word_tokenize(text)
return words
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, stem_and_remove_stopwords]
for pipe in pipeline:
data = pipe(data)
print("We are done here in tokenizing")
return data


def tf_idf_vectorize(data):
@@ -38,10 +83,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
# print(f"Feature names: {feature_names}")
# print(f"X looks like this: {X}")
# print(f"Shape of X: {X.shape}")
# print(f"Sum: {X.sum(axis=0)}")
print(f"Feature names: {feature_names}")
print(f"X looks like this: {X}")
print(f"Shape of X: {X.shape}")
print(f"Sum: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words
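
A short usage note for the new pipeline in tokenize_data: each step is applied to the output of the previous one, so the string is cleaned (punctuation, HTML, URLs, emojis), split on whitespace, then stemmed with English stopwords removed. An illustrative call follows; the sample sentence and expected output are only indicative, and the NLTK stopwords corpus must be downloaded first.

import nltk
from custom_tokenizer import tokenize_data

nltk.download("stopwords", quiet=True)

sample = "Visit <b>Tübingen</b>! More info at https://uni-tuebingen.de"
print(tokenize_data(sample))
# Roughly: ['visit', 'btübingenb', 'more', 'info', 'httpsunituebingend']
# Note that remove_punctuations runs before remove_html and remove_url, so the
# tag brackets and the URL's "://" are already gone by the time those steps run,
# and the leftover characters stick to the surrounding words.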

