Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into main
  • Loading branch information
Abhinav271828 committed Jul 23, 2021
2 parents a0ce409 + f02aff5 commit 4eb9418
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
3 changes: 2 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
SEED_URL = "http://premchand.co.in/story/pariksha"
# Starting page for the scraper: a Premchand short story ("Pariksha").
SEED_URL = "http://premchand.co.in/story/pariksha"
# Regex used by scraping.sentence_tokenize to split Hindi text into
# sentences. The capturing group makes re.split keep each delimiter
# ('!', '?', or the Devanagari danda '।') as its own list element.
SENTENCE_DELIMITER = r"(!|\?|।)"
21 changes: 21 additions & 0 deletions scraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import re


def clean_text(text):
# takes text and gets rid of the english stuff from it
return_text = re.sub(r'[A-Z]|[0-9]|[a-z]', "", text)
Expand Down Expand Up @@ -55,3 +56,23 @@ def create_relevant_data_files(data_dir_path):
# file was already made
pass


def sentence_tokenize(text):
    """Split *text* into sentence tokens, excluding question sentences.

    The text is split with SENTENCE_DELIMITER (from config); because the
    pattern has a capturing group, re.split keeps each delimiter as its
    own element, which lets us inspect the punctuation that follows each
    sentence fragment.

    Returns a list of sentence strings in original order, with:
      * the delimiter tokens themselves removed,
      * empty fragments (from adjacent delimiters or a trailing
        delimiter) removed,
      * any sentence immediately followed by a "?" delimiter dropped
        ("we don't want question sentences").
    """
    parts = re.split(SENTENCE_DELIMITER, text)
    sentences = []
    for index, token in enumerate(parts):
        # Guard 1: drop empty fragments and the delimiter tokens themselves.
        # (The original also appended '' fragments between back-to-back
        # delimiters, e.g. "a!!b"; the emptiness check fixes that.)
        if not token or re.match(SENTENCE_DELIMITER, token):
            continue
        # Guard 2: drop sentences terminated by "?" (question sentences).
        if index + 1 < len(parts) and parts[index + 1] == "?":
            continue
        # Note: a final fragment with no trailing delimiter cannot be a
        # question, so it is kept (the original dropped it unconditionally
        # because the append only ran when index < len(parts) - 1).
        sentences.append(token)
    return sentences

0 comments on commit 4eb9418

Please sign in to comment.