"""
@author: Anshul Srivastava
"""
#import os
import pandas as pd
import csv
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import seaborn as sb
#before reading the files, set up the working directory to point to the project repo
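#for example (sketch; the path is hypothetical, point it at your local checkout):
#  import os
#  os.chdir('/path/to/project/repo')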
#reading data files
test_filename = 'test.csv'
train_filename = 'train.csv'
valid_filename = 'valid.csv'
train_news = pd.read_csv(train_filename)
test_news = pd.read_csv(test_filename)
valid_news = pd.read_csv(valid_filename)
#data observation
def data_obs():
    print("training dataset size:")
    print(train_news.shape)
    print(train_news.head(10))
    #below datasets were used for testing and validation purposes
    print(test_news.shape)
    print(test_news.head(10))
    print(valid_news.shape)
    print(valid_news.head(10))
#check the data by calling below function
#data_obs()
#distribution of classes for prediction
def create_distribution(dataFile):
    return sb.countplot(x='Label', data=dataFile, palette='hls')
#calling the function on each dataset shows that the training, test and valid data seem to be fairly evenly distributed between the classes
create_distribution(train_news)
create_distribution(test_news)
create_distribution(valid_news)
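#note: when run as a plain script, the seaborn countplots only render once
#matplotlib draws them; a minimal sketch, assuming matplotlib is installed
#(it is a seaborn dependency):
#  import matplotlib.pyplot as plt
#  create_distribution(train_news)
#  plt.show()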
#data integrity check (missing label values)
#none of the datasets contains missing values, therefore no cleaning is required
def data_qualityCheck():
    print("Checking data qualities...")
    train_news.isnull().sum()
    train_news.info()
    #below datasets were used for testing and validation purposes
    test_news.isnull().sum()
    test_news.info()
    valid_news.isnull().sum()
    valid_news.info()
    print("check finished.")
#run the below function call to see the quality check results
#data_qualityCheck()
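#if missing labels were ever found, a minimal cleanup sketch (not needed here,
#since the datasets are complete; assumes the label column is named 'Label'
#as used in create_distribution above) would be:
#  train_news = train_news.dropna(subset=['Label'])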
#stemmer and stopword list used by process_data() below
#(requires the NLTK stopwords corpus: nltk.download('stopwords'))
eng_stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))
#Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed
#process the data
def process_data(data, exclude_stopword=True, stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    if stem:
        tokens_stemmed = stem_tokens(tokens_stemmed, eng_stemmer)
    if exclude_stopword:
        tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords]
    return tokens_stemmed
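#illustrative usage (hedged example; exact output depends on the NLTK stopword
#list and Snowball stemmer version):
#  process_data(['The', 'Runners', 'liked', 'running'])
#  -> ['runner', 'like', 'run']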
#creating ngrams
#unigram
def create_unigram(words):
    assert type(words) == list
    return words
#bigram
def create_bigrams(words):
    assert type(words) == list
    skip = 0
    join_str = " "
    Len = len(words)
    if Len > 1:
        lst = []
        for i in range(Len-1):
            for k in range(1, skip+2):
                if i+k < Len:
                    lst.append(join_str.join([words[i], words[i+k]]))
    else:
        #set it as unigram
        lst = create_unigram(words)
    return lst
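#illustrative usage with skip=0, i.e. plain adjacent bigrams:
#  create_bigrams(['fake', 'news', 'detection'])
#  -> ['fake news', 'news detection']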
"""
#trigrams
def create_trigrams(words):
assert type(words) == list
skip == 0
join_str = " "
Len = len(words)
if L > 2:
lst = []
for i in range(1,skip+2):
for k1 in range(1, skip+2):
for k2 in range(1,skip+2):
for i+k1 < Len and i+k1+k2 < Len:
lst.append(join_str.join([words[i], words[i+k1],words[i+k1+k2])])
else:
#set is as bigram
lst = create_bigram(words)
return lst
"""
porter = PorterStemmer()
def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
#doc = ['runners like running and thus they run','this is a test for tokens']
#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()])
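#quick sanity check for tokenizer_porter (hedged example; stems shown are
#what NLTK's PorterStemmer typically produces):
#  tokenizer_porter('runners like running and thus they run')
#  -> ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']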
#show the distribution of labels in the train and test data
"""def create_datafile(filename)
#function to slice the dataframe to keep variables necessary to be used for classification
return "return df to be used"
"""
"""#converting multiclass labels present in our datasets to binary class labels
for i , row in data_TrainNews.iterrows():
if (data_TrainNews.iloc[:,0] == "mostly-true" | data_TrainNews.iloc[:,0] == "half-true" | data_TrainNews.iloc[:,0] == "true"):
data_TrainNews.iloc[:,0] = "true"
else :
data_TrainNews.iloc[:,0] = "false"
for i,row in data_TrainNews.iterrows():
print(row)
"""