-
Notifications
You must be signed in to change notification settings - Fork 1
/
json_reviews_1vote_foodbusiness.py
94 lines (88 loc) · 3.5 KB
/
json_reviews_1vote_foodbusiness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json, os, csv
# Pipeline over the Yelp academic dataset (expects Python 3):
#   1. read the food/restaurant category names from categories.csv,
#   2. collect the ids of businesses tagged with at least one such category,
#   3. keep the reviews of those businesses that have >= 1 "useful" vote,
#   4. export the review text, star rating and user id of each kept review
#      to line-aligned files.
# All files are read and written as UTF-8 so results do not depend on the
# platform's default encoding.


def iter_json_records(lines):
    """Yield each JSON value found in *lines*.

    A value normally occupies one line, but a value that spans several
    lines is supported: lines are accumulated until they parse. Blank
    lines between values are ignored.

    Args:
        lines: iterable of text lines (for example an open text file).

    Yields:
        The decoded JSON values, in input order.

    Raises:
        ValueError: if the input ends in the middle of a JSON value.
    """
    pending = ''
    for line in lines:
        pending += line
        if not pending.strip():
            # Only whitespace so far: nothing to parse yet.
            pending = ''
            continue
        try:
            record = json.loads(pending)
        except ValueError:
            # Not yet a complete JSON value; keep accumulating lines.
            continue
        pending = ''
        yield record
    if pending.strip():
        raise ValueError(
            'input ended in the middle of a JSON value: %r' % pending[:80])


def load_restaurant_categories(csv_path):
    """Return the set of category names on the first row of *csv_path*.

    Only the first CSV row is used; any further rows are ignored. An empty
    file yields an empty set.
    """
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        first_row = next(csv.reader(f), [])
    return set(first_row)


def collect_food_business_ids(business_path, restaurant_categories,
                              ids_csv_path):
    """Collect ids of businesses that have at least one wanted category.

    Args:
        business_path: file of JSON business records, each with a
            'business_id' and a 'categories' entry.
        restaurant_categories: set of category names to match against.
        ids_csv_path: output file; receives one matching id per line.

    Returns:
        The matching business ids, de-duplicated, in first-seen order.
    """
    seen = set()  # constant-time duplicate check
    ordered_ids = []
    with open(business_path, encoding='utf-8') as src, \
            open(ids_csv_path, 'w', encoding='utf-8') as out:
        for business in iter_json_records(src):
            if not business:
                continue
            # A missing or null category list means "no categories".
            categories = business.get('categories') or ()
            if restaurant_categories.isdisjoint(categories):
                continue
            business_id = business['business_id']
            if business_id in seen:
                continue
            seen.add(business_id)
            ordered_ids.append(business_id)
            out.write(business_id + '\n')
    return ordered_ids


def filter_useful_reviews(review_path, business_ids, out_path):
    """Keep reviews of the given businesses that have >= 1 useful vote.

    Args:
        review_path: file of JSON review records, each with 'business_id'
            and 'votes' -> 'useful' entries.
        business_ids: iterable of business ids whose reviews are wanted.
        out_path: output file; receives one kept review per line.

    Returns:
        The kept review records (dicts), in input order.
    """
    wanted = set(business_ids)  # constant-time membership per review
    kept = []
    with open(review_path, encoding='utf-8') as src, \
            open(out_path, 'w', encoding='utf-8') as out:
        for review in iter_json_records(src):
            if not review:
                continue
            if review['business_id'] not in wanted:
                continue
            if review['votes']['useful'] < 1:
                continue
            # NOTE(review): each line is the Python repr of the dict, not
            # JSON, even though the file is named *.json. The format is kept
            # as-is because downstream readers may rely on it -- confirm
            # before switching to json.dumps.
            try:
                out.write(str(review) + '\n')
            except UnicodeEncodeError:
                # Best effort: drop a review that cannot be encoded.
                continue
            kept.append(review)
    return kept


def export_review_corpus(reviews, refiltered_path, corpus_path,
                         user_ids_path, ratings_path, preview=4):
    """Write text, rating, user id and full record of each usable review.

    A review is usable when its 'type' is 'review' and its 'text' is not
    empty. For every exported review exactly one write is made to each of
    the four output files, so they stay aligned write-for-write.

    Args:
        reviews: iterable of review dicts.
        refiltered_path: output file for the repr of each exported review.
        corpus_path: output file for the review texts.
        user_ids_path: output file for the user ids.
        ratings_path: output file for the star ratings.
        preview: how many of the first exported reviews to echo to stdout.

    Returns:
        A tuple (ratings, skipped): the star ratings of the exported
        reviews in order, and the number of usable reviews that were
        skipped because a field was missing or could not be written.
    """
    ratings = []
    skipped = 0
    with open(refiltered_path, 'w', encoding='utf-8') as refiltered, \
            open(corpus_path, 'w', encoding='utf-8') as corpus, \
            open(user_ids_path, 'w', encoding='utf-8') as user_ids, \
            open(ratings_path, 'w', encoding='utf-8') as ratings_out:
        outputs = (corpus, ratings_out, user_ids, refiltered)
        for review in reviews:
            if review['type'] != 'review':
                continue
            text = review['text']
            if not text:
                continue
            stars = review['stars']
            try:
                rows = (
                    text + '\n',
                    str(stars) + '\n',  # stars is numeric; convert first
                    review['user_id'] + '\n',
                    str(review) + '\n',
                )
                # Validate every row before writing any of them, so that a
                # failure can never leave the output files out of step.
                for row in rows:
                    row.encode('utf-8')
            except (KeyError, TypeError, UnicodeEncodeError):
                skipped += 1
                continue
            for out, row in zip(outputs, rows):
                out.write(row)
            ratings.append(stars)
            if len(ratings) <= preview:
                print(str(review) + '\n')
    return ratings, skipped


def main():
    """Run the whole pipeline on the dataset under the working directory."""
    cwd = os.getcwd()
    dataset_dir = os.path.join(cwd, 'yelp_dataset_challenge_academic_dataset')

    restaurant_categories_set = load_restaurant_categories(
        os.path.join(dataset_dir, 'categories.csv'))
    json_businessid_list = collect_food_business_ids(
        os.path.join(dataset_dir, 'yelp_academic_dataset_business.json'),
        restaurant_categories_set,
        os.path.join(dataset_dir, 'businessID2.csv'))
    json_review_list = filter_useful_reviews(
        os.path.join(dataset_dir, 'yelp_academic_dataset_review.json'),
        json_businessid_list,
        os.path.join(dataset_dir, 'dataset_1usefulreview_review.json'))
    countratings, _skipped = export_review_corpus(
        json_review_list,
        os.path.join(dataset_dir,
                     'dataset_1usefulreview_review_refiltered.json'),
        os.path.join(dataset_dir, 'corpus_1useful_review.txt'),
        os.path.join(cwd, 'truncated_UserId_list2.csv'),
        os.path.join(cwd, 'ratings.csv'))

    # Two lines are printed, as the original script did: the number of
    # exported reviews and the number of collected ratings (always equal).
    print(len(countratings))
    print(len(countratings))


if __name__ == '__main__':
    main()