This repository has been archived by the owner on Oct 2, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Modified contains_company_name function to be more strict
- Loading branch information
1 parent
7068649
commit baeca5f
Showing
1 changed file
with
83 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import json | ||
import string | ||
import sys | ||
|
||
from collections import OrderedDict | ||
|
||
# Returns true if the company name appears in the tweet, false otherwise | ||
def contains_company_name(tweet): | ||
# Add a new empty list to the tweet json object to store company names in | ||
# the tweet | ||
tweet["companies"] = [] | ||
|
||
# Extract the tweet text from the json object and remove all punctuation | ||
# not in company names | ||
tweet_text = str(tweet["text"].encode("ascii", "ignore")) | ||
tweet_text = tweet_text.translate(string.maketrans("",""), invalid_punctuation) | ||
|
||
# Split the tweet into individual words stored in a list, then determine | ||
# what company names with no spaces are in this list | ||
words_in_tweet = tweet_text.split() | ||
for name in company_names["no_spaces"]: | ||
if name in words_in_tweet: | ||
tweet["companies"].append(name) | ||
|
||
# Now search for company names with spaces in the entire tweet string (with | ||
# punctuation removed) | ||
for name in company_names["with_spaces"]: | ||
if name in tweet_text: | ||
tweet["companies"].append(name) | ||
|
||
# Returns true if a company was found in the tweet | ||
return len(tweet["companies"]) > 0 | ||
|
||
|
||
# Returns true if the tweet is in English, false otherwise | ||
def is_english(tweet): | ||
return tweet["tweetOwner"]["language"] == "en" | ||
|
||
|
||
# Returns true if the tweet meets the filtering criteria, false otherwise | ||
def meets_criteria(tweet): | ||
return is_english(tweet) and contains_company_name(tweet) | ||
|
||
|
||
def main(): | ||
# Open output .json data file provided at command line | ||
output_file = open(sys.argv[2], "w") | ||
|
||
# Write tweets in input .json file that pass filtering criteria to | ||
# output .json data file | ||
with open(sys.argv[1], "r") as input_file: | ||
for line in input_file: | ||
tweet = json.loads(line, object_pairs_hook=OrderedDict) | ||
if meets_criteria(tweet): | ||
json.dump(tweet, output_file) | ||
output_file.write("\n") | ||
|
||
# Close the opened output file | ||
output_file.close() | ||
|
||
|
||
if __name__ == "__main__": | ||
# Check for proper command line arguments | ||
if len(sys.argv) != 3: | ||
print "Usage: filter.py <input_file.json> <output_file.json>" | ||
sys.exit() | ||
|
||
# Load 100 company names into a dictionary separated into two lists: one | ||
# list containing names with spaces, the other without spaces | ||
company_names = { "no_spaces": [], "with_spaces": [] } | ||
with open("company_names.txt", "r") as companies_file: | ||
for line in companies_file: | ||
name = line.strip() | ||
if ' ' in name: | ||
company_names["with_spaces"].append(name) | ||
else: | ||
company_names["no_spaces"].append(name) | ||
|
||
# String of punctuation to remove from tweet text | ||
invalid_punctuation = "!\"#$%()*+,./:;<=>?@[\\]^_`{|}~" | ||
|
||
# Begin filtering execution | ||
main() |