Skip to content

Commit

Permalink
Merge pull request #144 from chicago-justice-project/feature/sent_eval
Browse files Browse the repository at this point in the history
sentiment evaluation v0
  • Loading branch information
jlherzberg authored Jan 15, 2020
2 parents d897020 + e3d13dd commit d7b3366
Show file tree
Hide file tree
Showing 10 changed files with 861 additions and 10 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ install:
- pip install pytest
- pip install pytest-cov
- pip install requests
- pip install google-cloud-language
- |
if [[ $FLAKE8 ]]; then
pip install flake8
Expand Down
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @jherzberg @mchladek
* @jherzberg @mchladek @RJWorth
759 changes: 759 additions & 0 deletions lib/notebooks/senteval_budgeting.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions lib/tagnews/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from . import crimetype

from .crimetype.tag import CrimeTags
from .senteval.eval import SentimentGoogler
from .geoloc.tag import GeoCoder, get_lat_longs_from_geostrings
from .utils.load_data import load_data
from .utils.load_vectorizer import load_glove

__all__ = [utils, crimetype, CrimeTags, GeoCoder,
__all__ = [utils, crimetype, CrimeTags, GeoCoder, SentimentGoogler,
get_lat_longs_from_geostrings, load_data, load_glove]

__version__ = '1.2.4'
__version__ = '1.3.0'
3 changes: 3 additions & 0 deletions lib/tagnews/senteval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import eval, police_words

# __all__ entries must be strings (PEP 8); listing the module objects
# themselves makes `from tagnews.senteval import *` raise TypeError.
__all__ = ["eval", "police_words"]
86 changes: 86 additions & 0 deletions lib/tagnews/senteval/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

from tagnews.senteval.police_words import police_words_list, bins


# def process_google_result(text):
# document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
# sentiment = client.analyze_entity_sentiment(document=document)
#
# for entity in sentiment.entities:
# clean_entity = "".join(filter(str.isalpha, entity)).lower()
#
# if clean_entity in police_words_list:
#
# for mention in entity.mentions:
# return mention.sentiment.score


class SentimentGoogler:
    """Score police-related sentiment in article text using the Google
    Cloud Natural Language entity-sentiment API.
    """

    def __init__(self):
        # Requires Google Cloud credentials to be configured in the
        # environment; connect_to_client() opens the API client.
        self.client = self.connect_to_client()
        self.police_words = police_words_list
        self.bins = bins[::-1]  # reversed because we start with lower numbered bins
        self.num_bins = len(bins)

    def run(self, doc_text):
        """Return the sentiment score of the first police-related entity
        found in `doc_text`, or None when no such entity is present.
        """
        sentiment_ = self.call_api(doc_text)
        for entity in sentiment_.entities:
            police_entity = self.is_police_entity(entity)
            if police_entity:
                return self.sentiment_from_entity(police_entity)

    def connect_to_client(self):
        # Thin wrapper so tests can stub out the client connection.
        return language.LanguageServiceClient()

    def sentiment_from_entity(self, entity):
        # Score is in [-1.0, 1.0]; negative values mean negative sentiment.
        return entity.sentiment.score

    def call_api(self, doc_text):
        """
        Parameters
        ----------
        doc_text : str
            article text

        Returns
        -------
        sentiment : json
            google response call
        """
        document = types.Document(content=doc_text,
                                  type=enums.Document.Type.PLAIN_TEXT)
        sentiment = self.client.analyze_entity_sentiment(document=document)

        return sentiment

    def is_police_entity(self, entity):
        """Return `entity` if it (or one of its mentions) matches a police
        word, else False.
        """
        # NOTE(review): this branch only matches when `entity` is itself a
        # plain string; API Entity objects never compare equal to strings,
        # so for API results the mention scan below does the real work.
        # Confirm whether matching on entity.name was intended here.
        if entity in self.police_words:
            return entity
        for mention in entity.mentions:
            if pre_process_text(mention.text.content) in self.police_words:
                return entity
        return False

    def extract_google_priority_bin(self, article, cpd_model_val, cpd_val):
        """Combine model signals and police-word density into a priority bin.

        Parameters
        ----------
        article : str
            Article text.
        cpd_model_val : float
            Model score, assumed in [0, 1] -- TODO confirm with caller.
        cpd_val : float
            CPD relevance score, assumed in [0, 1] -- TODO confirm.

        Returns
        -------
        int
            Bin index in [0, num_bins - 1]; a score s falls in the first
            (ascending) bin whose upper edge exceeds s.
        """
        cop_word_counts = sum(article.count(substr) for substr in self.police_words)
        word_frac = min(cop_word_counts / (2 * len(self.police_words)), 1.)
        score = 0.5 * cpd_val + 0.25 * cpd_model_val + 0.25 * word_frac
        # self.bins is stored descending; restore ascending order for the scan
        # (previously this read the module-level `bins` global directly).
        ascending = self.bins[::-1]
        # `next` with a default fixes the IndexError the old list-index
        # lookup raised when the score reached the top edge (score >= 1.0);
        # it also avoids shadowing the builtin `bin`.
        return next(
            (i for i, edge in enumerate(ascending) if edge > score),
            self.num_bins - 1,
        )


def pre_process_text(html_text):
    """Normalize article text for police-word matching.

    Parameters
    ----------
    html_text : str
        Article text.

    Returns
    -------
    words : str
        The input reduced to its alphabetic characters only, lower-cased.
    """
    letters_only = [ch for ch in html_text if ch.isalpha()]
    words = "".join(letters_only).lower()
    return words
4 changes: 4 additions & 0 deletions lib/tagnews/senteval/police_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Lower-cased tokens used to flag police-related entities and mentions in
# article text; matched against pre_process_text() output (letters only,
# lower case).
police_words_list = ["police", "officer", "cop", "officers", "pigs"]

# Ascending upper edges of the priority-score bins: a score s falls in the
# first bin whose edge exceeds s.
bins = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
num_bins = len(bins)  # convenience constant; kept in sync with `bins`
6 changes: 1 addition & 5 deletions lib/tagnews/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ def load_data(data_folder=__data_folder, nrows=None):
into columns called cat_NUMBER. The k-hot encoding is done assuming that
the categories are 1-indexed and there are as many categories as the
maximum value of the numerical category_id column.
Inputs:
data_folder:
A folder containing the data files in CSV format.
Expand Down Expand Up @@ -189,7 +188,6 @@ def find_loc_in_string(locs, string):
neatly. This function will hopefully perform all necessary
transformations to find the given location text within the
larger string.
Inputs:
locs: list of locations as loaded by load_locations
string: bodytext of article in which to find locs
Expand Down Expand Up @@ -307,7 +305,6 @@ def subsample_and_resave(out_folder, n=5, input_folder=__data_folder,
into `out_folder`. If there are fewer than `n` articles
tagged with a type-of-crime, then we will use all of
the articles with that tag.
Inputs
------
out_folder : str
Expand Down Expand Up @@ -484,7 +481,6 @@ def load_crime_data(data_folder=__data_folder):
def load_ner_data(data_folder=__data_folder):
"""
Loads ner.csv from the specified data folder.
The column 'stag' is a binary value indicating whether or not
the row corresponds to the entity "geo". Typically, you will
want to use column 'word' to predict the column 'stag'.
Expand All @@ -501,4 +497,4 @@ def load_ner_data(data_folder=__data_folder):
df['tag'] = df['stag']
df = df[['word', 'all_tags', 'tag']]

return df
return df
2 changes: 1 addition & 1 deletion lib/tagnews/utils/load_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def load_glove(vectors_file, normalize=False):
Sample usage:
>>> vectors = load_glove('tagnews/data/glove.6B.50d.txt')
>>> vectors = load_glove('tagnews/geoloc/glove.6B.50d.txt')
>>> text = 'This is a sentence and stuff.'
>>> # you should use an actual tokenizer for this step.
>>> vectorized_text = vectors.loc[[word.lower()
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import os

with open('README.md') as f:
with open('README.md', "r") as f:
long_description = f.read()


init_file = os.path.join(os.path.split(__file__)[0], 'lib/tagnews/__init__.py')
with open(init_file) as f:
try:
Expand Down

0 comments on commit d7b3366

Please sign in to comment.