Skip to content

Commit

Permalink
Merge pull request #144 from chicago-justice-project/feature/sent_eval
Browse files Browse the repository at this point in the history
sentiment evaluation v0
  • Loading branch information
jlherzberg authored Jan 15, 2020
2 parents d897020 + e3d13dd commit d7b3366
Show file tree
Hide file tree
Showing 10 changed files with 861 additions and 10 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ install:
- pip install pytest
- pip install pytest-cov
- pip install requests
- pip install google-cloud-language
- |
if [[ $FLAKE8 ]]; then
pip install flake8
Expand Down
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @jherzberg @mchladek
* @jherzberg @mchladek @RJWorth
759 changes: 759 additions & 0 deletions lib/notebooks/senteval_budgeting.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions lib/tagnews/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
from . import crimetype

from .crimetype.tag import CrimeTags
from .senteval.eval import SentimentGoogler
from .geoloc.tag import GeoCoder, get_lat_longs_from_geostrings
from .utils.load_data import load_data
from .utils.load_vectorizer import load_glove

__all__ = [utils, crimetype, CrimeTags, GeoCoder,
__all__ = [utils, crimetype, CrimeTags, GeoCoder, SentimentGoogler,
get_lat_longs_from_geostrings, load_data, load_glove]

__version__ = '1.2.4'
__version__ = '1.3.0'
3 changes: 3 additions & 0 deletions lib/tagnews/senteval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from . import eval, police_words

# __all__ entries must be strings (PEP 8); listing the module objects
# themselves makes `from tagnews.senteval import *` raise TypeError.
__all__ = ["eval", "police_words"]
86 changes: 86 additions & 0 deletions lib/tagnews/senteval/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

from tagnews.senteval.police_words import police_words_list, bins


# def process_google_result(text):
# document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
# sentiment = client.analyze_entity_sentiment(document=document)
#
# for entity in sentiment.entities:
# clean_entity = "".join(filter(str.isalpha, entity)).lower()
#
# if clean_entity in police_words_list:
#
# for mention in entity.mentions:
# return mention.sentiment.score


class SentimentGoogler:
    """Score police-related sentiment in article text using the Google
    Cloud Natural Language entity-sentiment API.
    """

    def __init__(self):
        # Requires Google Cloud credentials to be configured in the
        # environment; connect_to_client() opens the API client.
        self.client = self.connect_to_client()
        self.police_words = police_words_list
        self.bins = bins[::-1]  # reversed because we start with lower numbered bins
        self.num_bins = len(bins)

    def run(self, doc_text):
        """Return the sentiment score of the first police-related entity
        found in `doc_text`, or None when no such entity is present.
        """
        sentiment_ = self.call_api(doc_text)
        for entity in sentiment_.entities:
            police_entity = self.is_police_entity(entity)
            if police_entity:
                return self.sentiment_from_entity(police_entity)

    def connect_to_client(self):
        # Thin wrapper so tests can stub out the client connection.
        return language.LanguageServiceClient()

    def sentiment_from_entity(self, entity):
        # Score is in [-1.0, 1.0]; negative values mean negative sentiment.
        return entity.sentiment.score

    def call_api(self, doc_text):
        """
        Parameters
        ----------
        doc_text : str
            article text

        Returns
        -------
        sentiment : json
            google response call
        """
        document = types.Document(content=doc_text,
                                  type=enums.Document.Type.PLAIN_TEXT)
        sentiment = self.client.analyze_entity_sentiment(document=document)

        return sentiment

    def is_police_entity(self, entity):
        """Return `entity` if it (or one of its mentions) matches a police
        word, else False.
        """
        # NOTE(review): this branch only matches when `entity` is itself a
        # plain string; API Entity objects never compare equal to strings,
        # so for API results the mention scan below does the real work.
        # Confirm whether matching on entity.name was intended here.
        if entity in self.police_words:
            return entity
        for mention in entity.mentions:
            if pre_process_text(mention.text.content) in self.police_words:
                return entity
        return False

    def extract_google_priority_bin(self, article, cpd_model_val, cpd_val):
        """Combine model signals and police-word density into a priority bin.

        Parameters
        ----------
        article : str
            Article text.
        cpd_model_val : float
            Model score, assumed in [0, 1] -- TODO confirm with caller.
        cpd_val : float
            CPD relevance score, assumed in [0, 1] -- TODO confirm.

        Returns
        -------
        int
            Bin index in [0, num_bins - 1]; a score s falls in the first
            (ascending) bin whose upper edge exceeds s.
        """
        cop_word_counts = sum(article.count(substr) for substr in self.police_words)
        word_frac = min(cop_word_counts / (2 * len(self.police_words)), 1.)
        score = 0.5 * cpd_val + 0.25 * cpd_model_val + 0.25 * word_frac
        # self.bins is stored descending; restore ascending order for the scan
        # (previously this read the module-level `bins` global directly).
        ascending = self.bins[::-1]
        # `next` with a default fixes the IndexError the old list-index
        # lookup raised when the score reached the top edge (score >= 1.0);
        # it also avoids shadowing the builtin `bin`.
        return next(
            (i for i, edge in enumerate(ascending) if edge > score),
            self.num_bins - 1,
        )


def pre_process_text(html_text):
    """Normalize article text for police-word matching.

    Parameters
    ----------
    html_text : str
        Article text.

    Returns
    -------
    words : str
        The input reduced to its alphabetic characters only, lower-cased.
    """
    letters_only = [ch for ch in html_text if ch.isalpha()]
    words = "".join(letters_only).lower()
    return words
4 changes: 4 additions & 0 deletions lib/tagnews/senteval/police_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Lower-cased tokens used to flag police-related entities and mentions in
# article text; matched against pre_process_text() output (letters only,
# lower case).
police_words_list = ["police", "officer", "cop", "officers", "pigs"]

# Ascending upper edges of the priority-score bins: a score s falls in the
# first bin whose edge exceeds s.
bins = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
num_bins = len(bins)  # convenience constant; kept in sync with `bins`
6 changes: 1 addition & 5 deletions lib/tagnews/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ def load_data(data_folder=__data_folder, nrows=None):
into columns called cat_NUMBER. The k-hot encoding is done assuming that
the categories are 1-indexed and there are as many categories as the
maximum value of the numerical category_id column.
Inputs:
data_folder:
A folder containing the data files in CSV format.
Expand Down Expand Up @@ -189,7 +188,6 @@ def find_loc_in_string(locs, string):
neatly. This function will hopefully perform all necessary
transformations to find the given location text within the
larger string.
Inputs:
locs: list of locations as loaded by load_locations
string: bodytext of article in which to find locs
Expand Down Expand Up @@ -307,7 +305,6 @@ def subsample_and_resave(out_folder, n=5, input_folder=__data_folder,
into `out_folder`. If there are fewer than `n` articles
tagged with a type-of-crime, then we will use all of
the articles with that tag.
Inputs
------
out_folder : str
Expand Down Expand Up @@ -484,7 +481,6 @@ def load_crime_data(data_folder=__data_folder):
def load_ner_data(data_folder=__data_folder):
"""
Loads ner.csv from the specified data folder.
The column 'stag' is a binary value indicating whether or not
the row corresponds to the entity "geo". Typically, you will
want to use column 'word' to predict the column 'stag'.
Expand All @@ -501,4 +497,4 @@ def load_ner_data(data_folder=__data_folder):
df['tag'] = df['stag']
df = df[['word', 'all_tags', 'tag']]

return df
return df
2 changes: 1 addition & 1 deletion lib/tagnews/utils/load_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def load_glove(vectors_file, normalize=False):
Sample usage:
>>> vectors = load_glove('tagnews/data/glove.6B.50d.txt')
>>> vectors = load_glove('tagnews/geoloc/glove.6B.50d.txt')
>>> text = 'This is a sentence and stuff.'
>>> # you should use an actual tokenizer for this step.
>>> vectorized_text = vectors.loc[[word.lower()
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import os

with open('README.md') as f:
with open('README.md', "r") as f:
long_description = f.read()


init_file = os.path.join(os.path.split(__file__)[0], 'lib/tagnews/__init__.py')
with open(init_file) as f:
try:
Expand Down

0 comments on commit d7b3366

Please sign in to comment.