# data_reader_hansa.py
import csv
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
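# NOTE: requires the NLTK data packages for the stopword list and the Punkt
# tokenizer, e.g. nltk.download('stopwords') and nltk.download('punkt')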
### Hansa dataset reader
print("****HANSA MARKET DATASET BUILDER****\n")
# the listings are already in CSV; we join all the files and remove duplicates
# input format: "time","item_id","title","vendor","price","type","nation"
HANSAPATH = r'C:\Users\Giovanni\PycharmProjects\dm_project\hansa-marketplace-listings-december-2016.csv'
DATASET_HANSA = 'hansa_dataset'
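# Illustrative example of the transformation below (hypothetical row, not taken
# from the dataset):
#   in : "2016-12-01","123","Buy http://x.to Fresh Cards","vendorX","$1,299.99","digital","USA"
#   out: vendorx,1299.99,buy fresh cards,buy fresh cards,usa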
count = 0
stopwordsCached = set(stopwords.words('english'))
exitFile2 = open(DATASET_HANSA, 'w')
with open(HANSAPATH, "r", newline='') as currentfile:  # newline='' as the csv module expects
    reader = csv.reader(currentfile, quotechar='"', doublequote=True,
                        delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    for row in reader:
        count += 1
        # row[2] is the title
        # remove links, snippet from Stack Overflow
        row[2] = re.sub(r"http\S+", "", row[2])
        # keep only alphabetic characters in the title
        row[2] = re.sub(r'[^a-zA-Z ]+', '', row[2])
        # stopword removal on the lower-cased, tokenized title
        description = row[2].lower()
        tokens = word_tokenize(description)
        filteredDescription = [word for word in tokens if word not in stopwordsCached]
        description = ' '.join(filteredDescription)
        # title cleaning: drop any remaining non-alphanumeric characters
        description = re.sub(r'[^a-zA-Z0-9 ]+', '', description)
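        # e.g. a title tokenized as ['the', 'best', 'of', 'the', 'cards']
        # comes out of the stopword filter as 'best cards'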
        try:
            if len(row) > 7:
                # extra columns: treat the last field as the nation
                size = len(row) - 1
                row[size] = re.sub(r'[^a-zA-Z ]+', '', row[size])
                price = row[4]
                # strip commas and currency symbols from the price
                price = re.sub(r'[^a-zA-Z0-9 .]+', '', price)
                # the cleaned title is written twice: title column and description column
                result_string = row[3] + "," + price + "," + description + "," + description + "," + str(row[size]).lstrip(" ") + '\n'
                result_string = result_string.lower()
            else:
                # standard 7-column row: row[6] is the nation
                row[6] = re.sub(r'[^a-zA-Z ]+', '', row[6])
                price = row[4]
                # strip commas and currency symbols from the price
                price = re.sub(r'[^a-zA-Z0-9 .]+', '', price)
                result_string = row[3] + "," + price + "," + description + "," + description + "," + str(row[6]).lstrip(" ") + '\n'
                result_string = result_string.lower()
        except IndexError:
            try:
                # nation column missing: default to "worldwide"
                price = row[4]
                price = re.sub(r'[^a-zA-Z0-9 .]+', '', price)
                result_string = row[3] + "," + price + "," + description + "," + description + ",worldwide\n"
                result_string = result_string.lower()
            except IndexError:
                # vendor and price missing as well: fall back to placeholder values
                result_string = 'a,0,' + description + "," + description + ",worldwide\n"
                result_string = result_string.lower()
        # remove links from the final record as well, snippet from Stack Overflow
        result_string = re.sub(r"http\S+", "", result_string)
        exitFile2.write(result_string)
        # progress indicator
        if count % 100 == 0:
            print(str(count) + " items have been read.")
# rows are ended, end of this file
exitFile2.close()
print(str(count) + " total items.")
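
# The header comment mentions removing duplicates, which the loop above never
# does. A minimal sketch of an exact-line dedup pass over the output file
# (assumption: two listings count as duplicates iff their cleaned output lines
# match exactly):
seen = set()
unique_lines = []
with open(DATASET_HANSA, 'r') as f:
    for line in f:
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)
with open(DATASET_HANSA, 'w') as f:
    f.writelines(unique_lines)
print(str(len(unique_lines)) + " unique items kept.")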