import csv
import json
import re
import string
import sys

# Compiled once at import time: clean() is called once per tweet.
# "@mention" or a full "http(s)://..." link, up to the next whitespace.
_MENTION_OR_URL = re.compile(r"(?:@|https?://)\S+")
# A standalone retweet marker, or a truncated link stub such as "http",
# "https" or "http:/t.c".  Word boundaries keep words that merely contain
# these letters (e.g. "START", "httpd") intact.
_RT_OR_URL_STUB = re.compile(r"\bRT\b|\bhttps?\b(?:[:/]\S*)?")
# Any single character from string.punctuation.
_PUNCTUATION = re.compile("[" + re.escape(string.punctuation) + "]")


def clean(tweet_text):
    """Return tweet_text reduced to plain ASCII words separated by single spaces.

    Steps, in order:
      1. drop every non-ASCII character;
      2. remove @mentions and http(s):// links;
      3. remove the standalone retweet marker "RT" and leftover link stubs;
      4. remove all characters in string.punctuation;
      5. collapse every run of whitespace (including newlines) to one space
         and strip both ends.

    Args:
        tweet_text: the raw tweet text (text or byte string). None and the
            empty string are accepted.

    Returns:
        The cleaned text as a native str. The result is '' when nothing is
        left, which the caller uses as the signal to skip the row.
    """
    if not tweet_text:
        return ''

    # Byte strings are decoded first so the ASCII filter below never has to
    # implicitly decode non-ASCII bytes (which raises on Python 2).
    if isinstance(tweet_text, bytes):
        tweet_text = tweet_text.decode('utf-8', 'ignore')

    # encode/decode round trip: works on both Python 2 and 3 and leaves text,
    # not bytes, for the regular expressions below.
    text = tweet_text.encode('ascii', 'ignore').decode('ascii')

    text = _MENTION_OR_URL.sub('', text)
    text = _RT_OR_URL_STUB.sub('', text)
    text = _PUNCTUATION.sub('', text)

    # split() with no argument splits on any whitespace, so newlines and
    # tabs are normalised here as well.
    return str(' '.join(text.split()))


# Call site visible in the same change (the rest of the script is not shown):
#     text = clean(values['text'])
#     if text != '':
#         writer.writerow([tweetId, text])