
Commit

Merge pull request #9 from am9zZWY/lukas-tokenize
Add tokenizer pipeline
am9zZWY authored Jul 10, 2024
2 parents afd8c5f + a82845f commit 5a5c17c
Showing 5 changed files with 153 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
crawler_states/*
*.csv

# Created by https://www.toptal.com/developers/gitignore/api/python,git,visualstudiocode,macos,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=python,git,visualstudiocode,macos,linux
15 changes: 15 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}
55 changes: 35 additions & 20 deletions engine/crawl.py
@@ -15,10 +15,11 @@
from custom_tokenizer import tokenize_data, tf_idf_vectorize, top_30_words
##### Language detection #####
from nltk.classify import textcat
from custom_db import *

##### Constants #####
# Maximum size of the links
MAX_SIZE = 1000
MAX_SIZE = 20
# Keywords to search for
# They must be present in the HTML of the page
REQUIRED_KEYWORDS = ["tübingen", "tuebingen", "tubingen", "t%C3%BCbingen"]
@@ -62,7 +63,7 @@
# Supported languages
LANGS = ["en", "eng", "en-GB", "en-US", "english"]
# Maximum number of threads
MAX_THREADS = 10
MAX_THREADS = 5
# User-Agent
USER_AGENT = "Modern Search Engines University of Tuebingen Project Crawler (https://uni-tuebingen.de/de/262377)"

@@ -171,6 +172,7 @@ def crawl(self) -> None:
# If we have reached the maximum size, stop
if len(found_links) >= MAX_SIZE:
print("max size reached")
print(get_overview())
break

# Get the next link to crawl
@@ -202,6 +204,7 @@ def crawl(self) -> None:
try:
response = requests.get(link, timeout=5, headers={"User-Agent": USER_AGENT}, allow_redirects=True, stream=True, proxies=False, auth=False, cookies=False)
soup = BeautifulSoup(response.text, "lxml")
#print(f"This is soup.text: {soup.text}")
text = soup.text.lower()

# Check language in html-tag and in the link
@@ -230,23 +233,6 @@ def check_link_lang(link):
html_lang = soup.find("html").get("lang")
xml_lang = soup.find("html").get("xml:lang")

img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

text = soup.text.lower()
alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)
if i == 1:
print(f"Text: {text}")
print(f"Type of text: {type(text)}")
print("Now printing top 30 words")
top_30 = top_30_words(data=[text])
print(f"Top 30 words: {top_30}")
i+=1

if not check_lang(html_lang) and not check_lang(xml_lang) and not check_link_lang(link) and not check_text_lang(text):
print(crawling_str + "unsupported language")
ignore_links.add(link)
@@ -279,6 +265,28 @@ def check_link_lang(link):
if link not in found_links and link not in ignore_links:
found_links.add(link)

img_tags = soup.findAll("img")
desciption = soup.find("meta", attrs={"name": "description"})
desciption_content = desciption.get("content") if desciption is not None else ""
title = soup.find("title")
title_content = title.string if title is not None else ""

alt_texts = [img.get("alt") for img in img_tags]
text = text + " ".join(alt_texts) + " " + str(desciption_content) + " " + str(title_content)

tokenized_text = tokenize_data(data=text)
if i == 1:
# print(f"Text: {text}")
# print(f"Type of text: {type(text)}")
# print("Now printing top 30 words")
# top_30 = top_30_words(data=[text])
# print(f"Top 30 words: {top_30}")
# i+=1
print("Saving the following into the database")
print(f"URL: {link}")
print(f"Tokenized text: {tokenized_text}")
save_html_to_df(url=link, tokenized_text=tokenized_text)

print(crawling_str + "done")

except Exception as e:
Expand Down Expand Up @@ -353,4 +361,11 @@ def start_crawl():


if __name__ == "__main__":
start_crawl()
start_crawl() # in crawling, we also tokenize
# TODO - separate crawling and tokenizing
index_pages()
index_df = access_index()
index_df.to_csv("inverted_index.csv")
save_pages()
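
Taken together, the crawl.py changes wire the new tokenizer and the in-memory store into the crawl loop: the assembled page text (body, image alt texts, meta description, title) is run through tokenize_data, stored via save_html_to_df, and after crawling the inverted index is built and exported. A minimal standalone sketch of that flow, using an illustrative URL and HTML snippet instead of a live request (it assumes the NLTK stopwords corpus is available):

from bs4 import BeautifulSoup

from custom_tokenizer import tokenize_data
from custom_db import save_html_to_df, index_pages, access_index, save_pages

# Illustrative page; crawl.py assembles `text` from a real response the same way.
html = "<html><head><title>Tübingen</title></head><body><p>Welcome to Tübingen!</p></body></html>"
soup = BeautifulSoup(html, "lxml")

title = soup.find("title")
text = soup.text.lower() + " " + (title.string if title is not None else "")

tokens = tokenize_data(data=text)
save_html_to_df(url="https://example.org/tuebingen", tokenized_text=tokens)

# After the crawl loop finishes: build the inverted index and export everything.
index_pages()
access_index().to_csv("inverted_index.csv")
save_pages()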


46 changes: 46 additions & 0 deletions engine/custom_db.py
@@ -0,0 +1,46 @@
import pandas as pd

from collections import defaultdict
import re

# Create a DataFrame to store HTML pages
pages_df = pd.DataFrame(columns=['id', 'url', 'tokenized_text'])
inverted_index = defaultdict(list)

def save_html_to_df(url, tokenized_text):
global pages_df
new_id = len(pages_df) + 1
new_row = {'id': new_id, 'url': url, 'tokenized_text': tokenized_text}
pages_df = pd.concat([pages_df,pd.DataFrame([new_row])], ignore_index=True)


# Create an inverted index

def get_overview():
return pages_df.head()

def save_pages():
global pages_df
pages_df.to_csv("pages.csv")

def add_document_to_index(doc_id, words: list[str]):
print(f"Adding stuff")
global inverted_index
for word in set(words):
inverted_index[word].append(doc_id)


def index_pages():
for index, row in pages_df.iterrows():

add_document_to_index(row['id'], row['tokenized_text'])


def access_index():
index_df = pd.DataFrame(list(inverted_index.items()), columns=['word', 'doc_ids'])
return index_df


# Convert the inverted index to a DataFrame
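
As a quick illustration of the helpers above: the inverted index maps each distinct token to the list of document ids that contain it, and access_index flattens that mapping into a word / doc_ids DataFrame. A small standalone example with made-up ids and already tokenized words:

from custom_db import add_document_to_index, access_index

# Made-up document ids and token lists, purely for illustration.
add_document_to_index(1, ["tübingen", "university", "crawler"])
add_document_to_index(2, ["tübingen", "neckar"])

print(access_index())
# One row per distinct word, e.g. (row order may vary because a set is used):
#          word  doc_ids
# 0    tübingen   [1, 2]
# 1  university      [1]
# 2     crawler      [1]
# 3      neckar      [2]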


67 changes: 56 additions & 11 deletions engine/custom_tokenizer.py
@@ -3,19 +3,64 @@
import re
# We have to name this file something other than tokenizer.py because otherwise there would be a conflict with the Beautiful Soup tokenizer
# and/or nltk tokenizer
from nltk.corpus import stopwords
import re
import nltk

def remove_punctuations(text):
punct_tag = re.compile(r'[^\w\s]')
text = punct_tag.sub(r'', text)
return text

# Removes HTML tags
def remove_html(text):
html_tag = re.compile(r'<.*?>')
text = html_tag.sub(r'', text)
return text

# Removes URL data
def remove_url(text):
url_clean = re.compile(r"https://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text


# Removes Emojis
def remove_emoji(text):
emoji_clean = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
text = emoji_clean.sub(r'', text)
url_clean = re.compile(r"https://\S+|www\.\S+")
text = url_clean.sub(r'', text)
return text

def tokenize_plain_words(words: str):
return words.split()

def stem_and_remove_stopwords(words):
# use the English Porter stemmer

stemmer = nltk.stem.porter.PorterStemmer()
words = [stemmer.stem(word) for word in words if word not in stopwords.words("english")] # added stemmer
return words



def tokenize_data(data):
"""
Tokenizes the input data.
"""
# delete whitespaces
text = data.strip()
text = re.sub(r'\s+', ' ', text)
# Split the data into words
print(f"Text: {text}")
words = nltk.word_tokenize(text)
return words
pipeline = [remove_punctuations, remove_html, remove_url, remove_emoji, tokenize_plain_words, stem_and_remove_stopwords]
for pipe in pipeline:
data = pipe(data)
print("We are done here in tokenizing")
return data


def tf_idf_vectorize(data):
@@ -38,10 +83,10 @@ def top_30_words(data):
X = vectorizer.fit_transform(data)
# Get the feature names
feature_names = vectorizer.get_feature_names_out()
# print(f"Feature names: {feature_names}")
# print(f"X looks like this: {X}")
# print(f"Shape of X: {X.shape}")
# print(f"Sum: {X.sum(axis=0)}")
print(f"Feature names: {feature_names}")
print(f"X looks like this: {X}")
print(f"Shape of X: {X.shape}")
print(f"Sum: {X.sum(axis=0)}")
top_30_words = sorted(zip(feature_names, X.sum(axis=0).tolist()[0]), key=lambda x: x[1], reverse=True)[:30]
return top_30_words
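
A short usage note for the new pipeline in tokenize_data: each step is applied to the output of the previous one, so the string is cleaned (punctuation, HTML, URLs, emojis), split on whitespace, then stemmed with English stopwords removed. An illustrative call follows; the sample sentence and expected output are only indicative, and the NLTK stopwords corpus must be downloaded first.

import nltk
from custom_tokenizer import tokenize_data

nltk.download("stopwords", quiet=True)

sample = "Visit <b>Tübingen</b>! More info at https://uni-tuebingen.de"
print(tokenize_data(sample))
# Roughly: ['visit', 'btübingenb', 'more', 'info', 'httpsunituebingend']
# Note that remove_punctuations runs before remove_html and remove_url, so the
# tag brackets and the URL's "://" are already gone by the time those steps run,
# and the leftover characters stick to the surrounding words.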

