-
Notifications
You must be signed in to change notification settings - Fork 0
/
putDB.py
104 lines (91 loc) · 3.03 KB
/
putDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import re
import pymongo
import csv
from pymongo import MongoClient
def connect(host='localhost', port=27017, db_name='Tweets'):
    """Open a MongoDB client and select a database.

    Args:
        host: Host name of the MongoDB server.
        port: TCP port of the MongoDB server.
        db_name: Name of the database to select on the client.

    Returns:
        A ``(client, db)`` tuple holding the ``MongoClient`` and the
        database handle obtained from it.
    """
    # Build exactly one client so that no extra, never-closed client
    # object is left behind.
    client = MongoClient(host, port)
    return client, client[db_name]
def read_tweets_file(filename):
    """Read a file holding one JSON-encoded tweet per line.

    Lines that are not valid JSON, or whose JSON value lacks any of the
    ``text``, ``id`` or ``lang`` fields, are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of dictionaries with the keys ``'tweet_id'``, ``'tweet'``
        and ``'language'``, in file order.
    """
    print("Reading tweets file")
    tweets = []
    # The context manager closes the file even if reading fails part-way.
    with open(filename, "r") as infile:
        # Iterating the file keeps the final record even when the file
        # does not end with a newline.
        for line in infile:
            try:
                item = json.loads(line)
                tweet = item["text"]
                ids = item["id"]
                lang = item["lang"]
            except (ValueError, KeyError, TypeError):
                # ValueError: malformed JSON (including blank lines).
                # KeyError: a required field is missing.
                # TypeError: the JSON value is not an object.
                continue
            tweets.append({'tweet_id': ids, 'tweet': tweet, 'language': lang})
    print("%d Extracted" % len(tweets))
    return tweets
def write_to_csv(data):
    """Write the English tweets in *data* to ``data-bala.csv``.

    One header row is written, followed by one row per tweet whose
    ``'language'`` is ``"en"``. Each row holds the tweet id, the original
    text, the cleaned text, the language, a ``'yes'``/empty flag telling
    whether a question word was found, and the matched question word.
    Tweets in other languages are skipped. The number of English tweets
    and the number of question-word matches are printed at the end.

    Args:
        data: Iterable of dictionaries with the keys ``'tweet_id'``,
            ``'tweet'`` and ``'language'``.
    """
    question_words = ('?', 'when', 'why', 'where', 'which', 'how', 'what', 'who')
    count = 0
    q_count = 0
    # NOTE: binary mode plus explicit UTF-8 encoding is the Python 2 csv
    # convention this script was written for.
    with open('data-bala.csv', 'wb') as csvfile:
        tweet_writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                                  quoting=csv.QUOTE_MINIMAL)
        tweet_writer.writerow(['tweet_id', 'tweet_original', 'tweet_clean',
                               'language', 'has_question_word', 'question_word'])
        for item in data:
            if item["language"] != "en":
                continue

            text = item["tweet"]
            # Tokenise once per tweet; a question word only counts when it
            # is a whole space-separated token.
            tokens = text.lower().split(' ')
            matches = [word for word in question_words if word in tokens]
            # Every matching word is counted, so one tweet can contribute
            # more than once; the last match in list order is reported.
            q_count += len(matches)
            has_question_word = 'yes' if matches else ''
            question_word = matches[-1] if matches else ''

            tweet_clean = clean_the_tweet_text(text)
            try:
                tweet_writer.writerow([
                    str(item["tweet_id"]),
                    text.encode("utf-8"),
                    tweet_clean,
                    str(item["language"]),
                    has_question_word,
                    question_word,
                ])
            except Exception:
                # Best effort: report the offending tweet and keep going.
                print(item)
            count += 1
    print("count: %d" % count)
    print("q_count: %d" % q_count)
def clean_the_tweet_text(raw_tweet):
    """Lower-case a tweet and strip retweet prefix, mentions and URLs.

    Removed are: a leading ``rt`` followed by whitespace, every
    ``@name`` mention (with any trailing colons), and every ``http://``
    or ``https://`` URL. Surrounding whitespace is left untouched.

    Args:
        raw_tweet: The tweet text as a text (unicode) string.

    Returns:
        The cleaned tweet as UTF-8 encoded bytes (a plain ``str`` on
        Python 2).
    """
    # The text is encoded before matching, so the pattern must be a bytes
    # pattern too; the raw literal keeps the regex escapes intact.
    noise_pattern = br'^rt\s+|@\w+:*|https?://[\w\.\/]*'
    clean_tweet = raw_tweet.lower().encode("utf-8")
    return re.sub(noise_pattern, b'', clean_tweet)
def write_to_db():
    """Load ``data.json`` and insert each of its items into MongoDB.

    The file is parsed as a single JSON document and every element of it
    is inserted into the ``tweets`` collection of the database returned
    by ``connect()``. The client is closed afterwards, whether or not
    the load or the inserts succeed.
    """
    client, db = connect()
    try:
        # The context manager closes the input file as soon as it is parsed.
        with open("data.json", "r") as infile:
            data = json.load(infile)
        print("Got Tweets, putting in mongodb")
        for item in data:
            # NOTE(review): Collection.insert is the legacy PyMongo API
            # (deprecated in PyMongo 3, removed in 4). It is kept because
            # the installed PyMongo version is not visible from this file.
            db.tweets.insert(item)
        print("Done putting in mongodb")
    finally:
        client.close()
def tweet_analyzer(tweets):
    """Collect the tweets that look like questions and print their count.

    A tweet is treated as a question when it contains a ``?`` anywhere,
    or when its first word is a question word. Leading ``rt`` markers,
    ``@mentions`` and ``#hashtags`` are skipped when finding that word.

    Args:
        tweets: Iterable of dictionaries that have a ``'tweet'`` key
            holding the tweet text.

    Returns:
        A list with the original text of every tweet judged to be a
        question, in input order.
    """
    qwords = frozenset([
        'who', 'what', 'when', 'where', 'why', 'how', 'do', 'is', 'could',
        'can', "can't", 'cant', 'would', "wouldn't", "wouldnt", 'should',
        "shouldn't", "shouldnt", 'did', 'will', 'has', 'have', "won't",
        'does', 'wont', 'doesnt', "doesn't", 'had', 'are',
    ])
    data = []
    for item in tweets:
        text = item['tweet']
        if '?' in text:
            data.append(text)
            continue
        # First token that is not a retweet marker, mention or hashtag.
        # None when the tweet is empty or made only of such tokens, which
        # avoids indexing into an empty token list.
        first_word = next(
            (token for token in text.lower().split()
             if token != 'rt' and not token.startswith(('@', '#'))),
            None,
        )
        # Exact membership, not substring: "this" must not match "is".
        if first_word in qwords:
            data.append(text)
    print(len(data))
    return data
if __name__ == "__main__":
    # Script entry point: extract the tweets stored in bala.txt and write
    # them out as CSV.
    write_to_csv(read_tweets_file("bala.txt"))