Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into main
  • Loading branch information
Abhinav271828 committed Jul 23, 2021
2 parents a0ce409 + f02aff5 commit 4eb9418
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
3 changes: 2 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
SEED_URL = "http://premchand.co.in/story/pariksha"
# Starting page for the scraper: a Premchand short story ("Pariksha").
SEED_URL = "http://premchand.co.in/story/pariksha"
# Regex used by scraping.sentence_tokenize to split Hindi text into
# sentences. The capturing group makes re.split keep each delimiter
# ('!', '?', or the Devanagari danda '।') as its own list element.
SENTENCE_DELIMITER = r"(!|\?|।)"
21 changes: 21 additions & 0 deletions scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import re


def clean_text(text):
# takes text and gets rid of the english stuff from it
return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
Expand Down Expand Up @@ -55,3 +56,23 @@ def create_relevant_data_files(data_dir_path):
# file was already made
pass


def sentence_tokenize(text):
    """Split *text* into sentence tokens, excluding question sentences.

    The text is split with SENTENCE_DELIMITER (from config); because the
    pattern has a capturing group, re.split keeps each delimiter as its
    own element, which lets us inspect the punctuation that follows each
    sentence fragment.

    Returns a list of sentence strings in original order, with:
      * the delimiter tokens themselves removed,
      * empty fragments (from adjacent delimiters or a trailing
        delimiter) removed,
      * any sentence immediately followed by a "?" delimiter dropped
        ("we don't want question sentences").
    """
    parts = re.split(SENTENCE_DELIMITER, text)
    sentences = []
    for index, token in enumerate(parts):
        # Guard 1: drop empty fragments and the delimiter tokens themselves.
        # (The original also appended '' fragments between back-to-back
        # delimiters, e.g. "a!!b"; the emptiness check fixes that.)
        if not token or re.match(SENTENCE_DELIMITER, token):
            continue
        # Guard 2: drop sentences terminated by "?" (question sentences).
        if index + 1 < len(parts) and parts[index + 1] == "?":
            continue
        # Note: a final fragment with no trailing delimiter cannot be a
        # question, so it is kept (the original dropped it unconditionally
        # because the append only ran when index < len(parts) - 1).
        sentences.append(token)
    return sentences

0 comments on commit 4eb9418

Please sign in to comment.