Skip to content
This repository has been archived by the owner on Oct 2, 2020. It is now read-only.

Commit

Permalink
new and improved cleaning script
Browse files Browse the repository at this point in the history
  • Loading branch information
kevin-wittmer committed Apr 20, 2016
1 parent d2b5505 commit 255b3a5
Showing 1 changed file with 15 additions and 3 deletions.
18 changes: 15 additions & 3 deletions clean.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
import csv
import json
import re
import string
import sys

# Patterns used by clean(), compiled once at import time.
# @mentions and full URLs, up to the next whitespace character.
_MENTION_OR_URL_RE = re.compile(r"(?:@|https?://)\S+")
# Leftover "http"/"https" stubs, and the retweet marker "RT" as a whole word
# only, so that words such as "SMART" or "START" are left intact.
_STUB_OR_RETWEET_RE = re.compile(r"https?|\bRT\b")
# Any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")


# Cleans the tweet text by removing special characters, RT, punctuation, and extra whitespace
def clean(tweet_text):
    """Return a normalized, ASCII-only version of a tweet's text.

    Steps, in order:
      1. Drop every non-ASCII character.
      2. Remove @mentions and http(s):// URLs.
      3. Remove leftover "http"/"https" stubs and the standalone "RT" marker.
      4. Remove ASCII punctuation.
      5. Collapse all whitespace runs (including newlines) to single spaces
         and strip leading/trailing whitespace.

    Args:
        tweet_text: The tweet text as a text string (``unicode`` on
            Python 2, ``str`` on Python 3), e.g. the ``text`` field of a
            decoded JSON tweet.

    Returns:
        The cleaned text as a native ``str``; an empty string when nothing
        is left after cleaning.
    """
    ascii_text = tweet_text.encode('ascii', 'ignore')
    # On Python 2 the encoded value is already a native str; on Python 3 it
    # is bytes and must be decoded back before any text processing.
    if not isinstance(ascii_text, str):
        ascii_text = ascii_text.decode('ascii')

    cleaned = _MENTION_OR_URL_RE.sub("", ascii_text)
    cleaned = _STUB_OR_RETWEET_RE.sub("", cleaned)
    # A regex is used instead of string.maketrans/str.translate because the
    # two-argument translate form only exists on Python 2.
    cleaned = _PUNCTUATION_RE.sub("", cleaned)
    # split() with no arguments splits on any whitespace, so newlines and
    # repeated spaces are normalized here in one step.
    return ' '.join(cleaned.split())

# Check for proper command line arguments
if len(sys.argv) != 2:
print 'Usage: clean.py <filename.json>'
Expand All @@ -25,9 +36,10 @@
for line in tweets:
values = json.loads(line)
if values['tweetOwner']['language'] == 'en':
tweetId = str(values['tweetId'])
text = values['text'].encode('ascii', 'ignore').replace('\n', ' ')
writer.writerow([tweetId, text])
tweetId = values['tweetId']
text = clean(values['text'])
if text != '':
writer.writerow([tweetId, text])

# Print location of cleaned data and close csv file
print 'Data written to:', csvfile.name
Expand Down

0 comments on commit 255b3a5

Please sign in to comment.