From 2d2c074d6eb32a898b97cdf5c11b9edcc616a6ff Mon Sep 17 00:00:00 2001
From: Okan Coskun
Date: Wed, 10 Jul 2024 13:00:00 +0200
Subject: [PATCH 1/2] include csv files in gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8104af4..27792ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 crawler_states/*
+*.csv
 
 # Created by https://www.toptal.com/developers/gitignore/api/python,git,visualstudiocode,macos,linux
 # Edit at https://www.toptal.com/developers/gitignore?templates=python,git,visualstudiocode,macos,linux

From a82845f88a3289e12c3661c26202e4de6a1d5ca6 Mon Sep 17 00:00:00 2001
From: Okan Coskun
Date: Wed, 10 Jul 2024 13:00:49 +0200
Subject: [PATCH 2/2] tokenize and preprocess data, generating an inverted index

---
 .vscode/launch.json        | 15 +++++++++
 engine/crawl.py            | 55 +++++++++++++++++++------------
 engine/custom_db.py        | 46 ++++++++++++++++++++++++++
 engine/custom_tokenizer.py | 67 +++++++++++++++++++++++++++++++-------
 4 files changed, 152 insertions(+), 31 deletions(-)
 create mode 100644 .vscode/launch.json
 create mode 100644 engine/custom_db.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..6b76b4f
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/engine/crawl.py b/engine/crawl.py
index 4db7373..f925e45 100644
--- a/engine/crawl.py
+++ b/engine/crawl.py
@@ -15,10 +15,11 @@
 from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
 ##### Language detection #####
 from nltk.classify import textcat
+from custom_db import *
 
 ##### Constants #####
 # Maximum size of the links
-MAX_SIZE = 1000
+MAX_SIZE = 20
 # Keywords to search for
 # They must be present in the HTML of the page
 REQUIRED_KEYWORDS = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
@@ -62,7 +63,7 @@
 # Supported languages
 LANGS = ["en", "eng", "en-GB", "en-US", "english"]
 # Maximum number of threads
-MAX_THREADS = 10
+MAX_THREADS = 5
 # User-Agent
 USER_AGENT = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"
 
@@ -171,6 +172,7 @@ def crawl(self) -> None:
             # If we have reached the maximum size, stop
             if len(found_links) >= MAX_SIZE:
                 print("max size reached")
+                print(get_overview())
                 break
 
             # Get the next link to crawl
@@ -202,6 +204,7 @@ def crawl(self) -> None:
             try:
                 response = requests.get(link, timeout=5, headers={"User-Agent": USER_AGENT}, allow_redirects=True, stream=True, proxies=False, auth=False, cookies=False)
                 soup = BeautifulSoup(response.text, "lxml")
+                # print(f"This is soup.text: {soup.text}")
                 text = soup.text.lower()
 
                 # Check language in html-tag and in the link
@@ -230,23 +233,6 @@ def check_link_lang(link):
                 html_lang = soup.find("html").get("lang")
                 xml_lang = soup.find("html").get("xml:lang")
 
-                img_tags = soup.findAll("img")
-                desciption = soup.find("meta", attrs={"name": "description"})
-                desciption_content = desciption.get("content") if desciption is not None else ""
-                title = soup.find("title")
-                title_content = title.string if title is not None else ""
-
-                text = soup.text.lower()
-                alt_texts = [img.get("alt") for img in img_tags]
[img.get("alt") for img in img_tags] - text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content) - if i == 1: - print(f"Text: {text}") - print(f"Type of text: {type(text)}") - print("Now printing top 30 words") - top_30 = top_30_words(data=[text]) - print(f"Top 30 words: {top_30}") - i+=1 - if not check_lang(html_lang) and not check_lang(xml_lang) and not check_link_lang(link) and not check_text_lang(text): print(crawling_str + "unsupported language") ignore_links.add(link) @@ -279,6 +265,28 @@ def check_link_lang(link): if link not in found_links and link not in ignore_links: found_links.add(link) + img_tags = soup.findAll("img") + desciption = soup.find("meta", attrs={"name": "description"}) + desciption_content = desciption.get("content") if desciption is not None else "" + title = soup.find("title") + title_content = title.string if title is not None else "" + + alt_texts = [img.get("alt") for img in img_tags] + text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content) + + tokenized_text = tokenize_data(data=text) + if i == 1: + # print(f"Text: {text}") + # print(f"Type of text: {type(text)}") + # print("Now printing top 30 words") + # top_30 = top_30_words(data=[text]) + # print(f"Top 30 words: {top_30}") + # i+=1 + print("Saving following into the Database") + print(f"URL: {link}") + print(f"Tokenized text: {tokenized_text}") + save_html_to_df(url=link, tokenized_text=tokenized_text) + print(crawling_str + "done") except Exception as e: @@ -353,4 +361,11 @@ def start_crawl(): if __name__ == "__main__": - start_crawl() + start_crawl() # in crawling, we also tokenize + # TODO - seperarw crawling and tokenizing + index_pages() + index_df = access_index() + index_df.to_csv("inverted_index.csv") + save_pages() + + diff --git a/engine/custom_db.py b/engine/custom_db.py new file mode 100644 index 0000000..93cfc70 --- /dev/null +++ b/engine/custom_db.py @@ -0,0 +1,46 @@ +import pandas as pd + +from collections import defaultdict +import re + +# Create a DataFrame to store HTML pages +pages_df = pd.DataFrame(columns=['id', 'url', 'tokenized_text']) +inverted_index = defaultdict(list) + +def save_html_to_df(url, tokenized_text): + global pages_df + new_id = len(pages_df) + 1 + new_row = {'id': new_id, 'url': url, 'tokenized_text': tokenized_text} + pages_df = pd.concat([pages_df,pd.DataFrame([new_row])], ignore_index=True) + + +# Create an inverted index + +def get_overview(): + return pages_df.head() + +def save_pages(): + global pages_df + pages_df.to_csv("pages.csv") + +def add_document_to_index(doc_id, words: list[str]): + print(f"Adding stuff") + global inverted_index + for word in set(words): + inverted_index[word].append(doc_id) + + +def index_pages(): + for index, row in pages_df.iterrows(): + + add_document_to_index(row['id'], row['tokenized_text']) + + +def access_index(): + index_df = pd.DataFrame(list(inverted_index.items()), columns=['word', 'doc_ids']) + return index_df + + +# Convert the inverted index to a DataFrame + + diff --git a/engine/custom_tokenizer.py b/engine/custom_tokenizer.py index b1c0f46..36d4c1a 100644 --- a/engine/custom_tokenizer.py +++ b/engine/custom_tokenizer.py @@ -3,19 +3,64 @@ import re # We have to name this file something else then tokenizer.py because otherweise there will be a conflict with the beautifoul soup tokenizer # and/or nltk tokenizer +from nltk.corpus import stopwords +import re +import nltk + +def remove_punctuations(text): + punct_tag = re.compile(r'[^\w\s]') + text = 
+    return text
+
+# Removes HTML syntaxes
+def remove_html(text):
+    html_tag = re.compile(r'<.*?>')
+    text = html_tag.sub(r'', text)
+    return text
+
+# Removes URL data
+def remove_url(text):
+    url_clean = re.compile(r"https://\S+|www\.\S+")
+    text = url_clean.sub(r'', text)
+    return text
+
+
+# Removes Emojis
+def remove_emoji(text):
+    emoji_clean = re.compile("["
+                             u"\U0001F600-\U0001F64F"  # emoticons
+                             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                             u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                             u"\U00002702-\U000027B0"
+                             u"\U000024C2-\U0001F251"
+                             "]+", flags=re.UNICODE)
+    text = emoji_clean.sub(r'', text)
+    url_clean = re.compile(r"https://\S+|www\.\S+")
+    text = url_clean.sub(r'', text)
+    return text
+
+def tokenize_plain_words(words: str):
+    return words.split()
+
+def stem_and_remove_stopwords(words):
+    # use the English Porter stemmer
+
+    stemmer = nltk.stem.porter.PorterStemmer()
+    words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")]  # added stemmer
+    return words
+
 
 
 def tokenize_data(data):
     """
     Tokenizes the input data.
     """
-    # delete whitespaces
-    text = data.strip()
-    text = re.sub(r'\s+', ' ', text)
-    # Split the data into words
-    print(f"Text: {text}")
-    words = nltk.word_tokenize(text)
-    return words
+    pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, stem_and_remove_stopwords]
+    for pipe in pipeline:
+        data = pipe(data)
+    print("We are done here in tokenizing")
+    return data
 
 
 def tf_idf_vectorize(data):
@@ -38,10 +83,10 @@ def top_30_words(data):
     X = vectorizer.fit_transform(data)
     # Get the feature names
     feature_names = vectorizer.get_feature_names_out()
-    # print(f"Feature names: {feature_names}")
-    # print(f"X sieht so aus: {X}")
-    # print(f"Shape of X: {X.shape}")
-    # print(f"Summe: {X.sum(axis=0)}")
+    print(f"Feature names: {feature_names}")
+    print(f"X looks like this: {X}")
+    print(f"Shape of X: {X.shape}")
+    print(f"Sum: {X.sum(axis=0)}")
     top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
     return top_30_words
 
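
For reference, the tokenizing and indexing pipeline introduced by PATCH 2/2 can be exercised on its own roughly as follows. This is a minimal sketch rather than part of the patch: the URL and sample text are made up, and it assumes the engine/ directory is on the import path and that the NLTK stopwords corpus has already been downloaded (nltk.download("stopwords")).

# Minimal usage sketch (not part of the patch); the URL and text are made-up examples.
from custom_tokenizer import tokenize_data
from custom_db import save_html_to_df, index_pages, access_index, save_pages

# Clean and tokenize raw page text: punctuation/HTML/URL/emoji removal,
# whitespace tokenization, stopword removal and Porter stemming.
tokens = tokenize_data("Tübingen is a university town on the Neckar river.")

# Store the tokenized page in the in-memory pages DataFrame,
save_html_to_df(url="https://example.org/tuebingen", tokenized_text=tokens)

# then build the inverted index (word -> list of page ids) and export everything.
index_pages()
access_index().to_csv("inverted_index.csv")
save_pages()  # writes pages.csv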