-
Notifications
You must be signed in to change notification settings - Fork 0
/
hindicorpus.py
122 lines (96 loc) · 3.11 KB
/
hindicorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import io
from HindiTokenizer import Tokenizer
from pydub import AudioSegment
import numpy as np
# Root directory of the corpus. Keep the trailing slash: file paths in this
# module are built by concatenating relative file ids onto this string.
corpusdir = "./data/"
def raw(fileid=None):
    """Return the raw text of one corpus file, or of every corpus file.

    Args:
        fileid: Path of a file relative to ``corpusdir``. When falsy (the
            default), every id returned by ``fileids()`` is read instead.

    Returns:
        A list of strings, one per file that was read. With an explicit
        ``fileid`` the list has exactly one element.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If the explicitly requested file is not valid
            UTF-8. In the all-files mode such files are reported on stdout
            and skipped instead.
    """
    if fileid:
        # ``with`` guarantees the handle is closed even if decoding fails.
        with io.open(corpusdir + fileid, "r", encoding="utf-8") as handle:
            return [handle.read()]

    texts = []
    for file_id in fileids():
        with io.open(corpusdir + file_id, "r", encoding="utf-8") as handle:
            try:
                texts.append(handle.read())
            except UnicodeDecodeError:
                # Best effort: one badly encoded file must not abort the
                # whole corpus read.
                print("File encoding issue: " + file_id)
    return texts
def fileids(corpus_root=None):
    """List the ``.txt`` files found one directory level below the corpus root.

    Args:
        corpus_root: Directory to scan. Defaults to the module-level
            ``corpusdir`` when omitted, which is the original behaviour.

    Returns:
        A list of file ids of the form ``"<subdirectory>/<name>.txt"``,
        relative to the corpus root, in the order ``os.listdir`` yields them.

    Raises:
        OSError: If the corpus root itself cannot be listed.
    """
    root = corpusdir if corpus_root is None else corpus_root
    files_ret = []
    for subdir in os.listdir(root):
        try:
            entries = os.listdir(os.path.join(root, subdir))
        except OSError:
            # Entry is a plain file or an unreadable directory: it holds no
            # corpus files, so skip it quietly.
            continue
        files_ret.extend(
            subdir + "/" + name for name in entries if name.endswith('.txt')
        )
    return files_ret
def fileids_search(search_text, corpus_root=None):
    """Return the ids of corpus files whose UTF-8 text contains *search_text*.

    Every file one directory level below the corpus root is examined. Files
    that cannot be opened or decoded as UTF-8 (for example audio files or
    nested directories) are skipped individually, so one unreadable file no
    longer hides the remaining files of its directory from the search.

    Args:
        search_text: Substring to look for.
        corpus_root: Directory to scan. Defaults to the module-level
            ``corpusdir`` when omitted, which is the original behaviour.

    Returns:
        A list of file ids of the form ``"<subdirectory>/<name>"``, relative
        to the corpus root.

    Raises:
        OSError: If the corpus root itself cannot be listed.
    """
    root = corpusdir if corpus_root is None else corpus_root
    matches = []
    for subdir in os.listdir(root):
        subdir_path = os.path.join(root, subdir)
        try:
            names = os.listdir(subdir_path)
        except OSError:
            # Not a directory (or not readable): nothing to search here.
            continue
        for name in names:
            try:
                with io.open(os.path.join(subdir_path, name), "r",
                             encoding="utf-8") as handle:
                    found = search_text in handle.read()
            except (OSError, UnicodeDecodeError):
                # Skip only this file; keep searching its siblings.
                continue
            if found:
                matches.append(subdir + "/" + name)
    return matches
def tokenize(fileid = None, remove_stopwords = False):
    """Tokenize one corpus file, or every corpus file.

    Each text returned by ``raw(fileid)`` is wrapped in a ``Tokenizer`` and
    its ``tokenize()`` method is run.

    Args:
        fileid: Passed straight through to ``raw``.
        remove_stopwords: When true, ``remove_stop_words()`` is also run and
            the tokenizer's ``final_tokens`` are collected; otherwise its
            ``tokens`` are collected.

    Returns:
        A list with one entry per text, each entry being the token
        collection taken from the tokenizer.
    """
    per_file_tokens = []
    for document in raw(fileid):
        tokenizer = Tokenizer(document)
        tokenizer.tokenize()
        if not remove_stopwords:
            per_file_tokens.append(tokenizer.tokens)
            continue
        tokenizer.remove_stop_words()
        per_file_tokens.append(tokenizer.final_tokens)
    return per_file_tokens
def sent_tokenize(fileid = None):
    """Split one corpus file, or every corpus file, into sentences.

    Each text returned by ``raw(fileid)`` is wrapped in a ``Tokenizer`` whose
    ``generate_sentences()`` method is run.

    Args:
        fileid: Passed straight through to ``raw``.

    Returns:
        A list with one entry per text, each entry being that tokenizer's
        ``sentences`` attribute.
    """
    sentences_per_file = []
    for document in raw(fileid):
        splitter = Tokenizer(document)
        splitter.generate_sentences()
        sentences_per_file += [splitter.sentences]
    return sentences_per_file
#Helper function to detect silence regions
def ratio(sample):
    """Return the fraction of values in *sample* lying in the range [-1, 1].

    Args:
        sample: A sequence or NumPy array of numeric sample values.

    Returns:
        A float between 0.0 and 1.0. An empty *sample* yields 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    values = np.asarray(sample)
    if values.size == 0:
        return 0.0
    # Compare against both bounds rather than using abs(): abs() overflows
    # on the most negative value of a fixed-width integer dtype.
    near_zero = (values >= -1) & (values <= 1)
    return float(np.count_nonzero(near_zero)) / values.size
def get_new_sentence_locations(filename):
    """Estimate, in whole seconds, where pauses between sentences occur.

    The audio file ``corpusdir + filename`` is loaded, its gain is reduced
    by 3, and it is examined one full second at a time (a trailing partial
    second is ignored). A second counts as silent when more than 5% of its
    samples fall in [-1, 1] according to ``ratio``.

    Second 0 is never reported, and a silent second is dropped whenever any
    of the four seconds after it is also silent, so a run of nearby silent
    seconds is represented by its last one only.

    Args:
        filename: Audio file path relative to ``corpusdir``.

    Returns:
        An ascending list of integer second offsets.
    """
    speech = AudioSegment.from_file(corpusdir+filename)
    speech = speech - 3 #Reduce the gain
    whole_seconds = int(len(speech)/1000)

    silent_seconds = []
    for second in range(whole_seconds):
        window = speech[second*1000 : (second+1)*1000].get_array_of_samples()
        if ratio(np.array(window)) > 0.05:
            silent_seconds.append(second)

    # Silence at the very start is not a sentence boundary.
    silent_seconds = [second for second in silent_seconds if second != 0]

    # Keep a silent second only when none of the next four seconds is silent.
    silent_lookup = set(silent_seconds)
    return [
        second
        for second in silent_seconds
        if not any(second + gap in silent_lookup for gap in (1, 2, 3, 4))
    ]
def play(filename):
    """Load the audio file ``corpusdir + filename``.

    Args:
        filename: Audio file path relative to ``corpusdir``.

    Returns:
        Whatever ``AudioSegment.from_file`` returns for that path.
    """
    audio_path = corpusdir+filename
    return AudioSegment.from_file(audio_path)
#Sentences are 0 indexed
def play_sentence(filename, sentence_number):
    """Return the audio of *filename* starting at the given sentence.

    Sentence 0 yields the complete audio. For any other number the audio is
    cut from the boundary ``get_new_sentence_locations(filename)[sentence_number]``
    (in seconds) to the end of the file.

    Args:
        filename: Audio file path relative to the corpus directory.
        sentence_number: Zero-based sentence index.

    Returns:
        The full audio, or the slice of it from the boundary onward.

    Raises:
        IndexError: If *sentence_number* has no entry in the boundary list.
    """
    # NOTE(review): sentence 1 uses boundary index 1, not 0, so the first
    # detected boundary is never used as a start point - confirm intended.
    audio = play(filename)
    if sentence_number != 0:
        boundaries = get_new_sentence_locations(filename)
        start_ms = boundaries[sentence_number]*1000
        audio = audio[start_ms:]
    return audio