major refactoring

chicago-justice-project · Aug 22, 2017 · 6341db1 · 6341db1
1 parent f267d54
commit 6341db1
Show file tree

Hide file tree

Showing 39 changed files with 1,705 additions and 9,060 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 *.pyc
 *.ipynb_checkpoints*
 *.pkl
+lib/newstag.egg-info
+build/
+dist/
diff --git a/INSTALLATION.md b/INSTALLATION.md
@@ -0,0 +1,2 @@
+1. Run `python setup.py install`.
+2. Verify the installation by running `python -m newstag.crimetype.cli`.
diff --git a/README.md b/README.md
@@ -2,6 +2,8 @@
 
 Let's compare how often different types of crimes are reported in certain areas vs. how often they actually occur in those areas. Are some crimes under-represented in certain areas but over-represented in others? To accomplish this, we'll need to be able to extract type-of-crime tags and geospatial data from news articles. We meet every Tuesday at [Chi Hack Night](https://chihacknight.org/), and you can find out more about [this specific project here](https://github.com/chihacknight/breakout-groups/issues/61).
 
+For installation instructions, see [INSTALLATION.md](./INSTALLATION.md)
+
 # Details
 
 The [Chicago Justice Project](http://chicagojustice.org/) has been scraping RSS feeds of articles written by Chicago area news outlets for several years, allowing them to collect almost 300,000 articles. At the same time, an amazing group of [volunteers](http://chicagojustice.org/volunteer-for-cjp/) have helped them tag these articles. The tags include crime categories like "Gun Violence", "Drugs", "Sexual Assault", but also organizations such as "Cook County State's Attorney's Office", "Illinois State Police", "Chicago Police Department", and other miscellaneous categories such as "LGBTQ", "Immigration".
@@ -39,7 +41,7 @@ Some articles may discuss multiple crimes. Some crimes may occur in multiple are
 
 # The Code
 
-Under the `src` folder you can find the source code.
+Under the `lib` folder you can find the source code.
 
 The `load_data.py` file loads the data from the CSV files (which are not stored in GitHub). Specifically, look at the `load_data.load_data()` function, which returns the `k`-hot encoded tags along with the article data.
 
@@ -60,3 +62,7 @@ Help [the team scraping articles](https://github.com/chicago-justice-project/chi
 * [Chicago Justice Project](http://chicagojustice.org/)
 * [Database Repo](https://github.com/kyaroch/chicago-justice)
 * [Chi Hack Night Group Description](https://github.com/chihacknight/breakout-groups/issues/61)
+
+# Saving a new model
+
+Working with pickle is difficult. In order to be able to load things sanely, I'm running the Python files that pickle the model using the `-m` flag, e.g. `python -m newstag.crimetype.models.binary_stemmed_logistic.model`.
diff --git a/lib/newstag/__init__.py b/lib/newstag/__init__.py
@@ -0,0 +1,4 @@
+from . import utils
+from . import crimetype
+
+__version__ = '0.0.2'
diff --git a/lib/newstag/crimetype/__init__.py b/lib/newstag/crimetype/__init__.py
@@ -0,0 +1,2 @@
+from . import tag
+from . import benchmark
diff --git a/src/benchmark_tagging.py → lib/newstag/crimetype/benchmark.py b/src/benchmark_tagging.py → lib/newstag/crimetype/benchmark.py
@@ -30,7 +30,7 @@ def get_kfold_split(N, k=4):
     return index_pairs
 
 
-def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
+def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4, verbose=False):
     """
     benchmark a classifier on preprocessed data.
 
@@ -47,6 +47,7 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
         dictionary of parameters passed to the classifier factory.
         If None, no parameters are passed.
     k : how many folds to use for cross validation
+    verbose : Should status be printed?
     """
     if clf_params_dict is None:
         clf_params_dict = {}
@@ -61,6 +62,9 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
 
     clfs = []
     for i, (idx_trn, idx_tst) in enumerate(fold_indexes):
+        if verbose:
+            print('step {} of {}...'.format(i, k), end='')
+
         clf = clf_factory(**clf_params_dict)
 
         x_trn = X[idx_trn, :]
@@ -85,6 +89,9 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
 
         clfs.append(clf)
 
+        if verbose:
+            print('done')
+
     return {'acc': acc, 'tpr': tpr, 'fpr': fpr, 'ppv': ppv, 'clfs': clfs}
 
 
@@ -103,8 +110,10 @@ def predict_articles(clf, vectorizer, df, n=100, seed=1029384756):
     for i, rand_i in enumerate(random_subset):
         s = 'Article ID: ' + str(df.index[rand_i])
         s += '\n' + df.iloc[rand_i, 3]
-        s += '\n Predicted Tags: ' + str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values)
+        s += '\n Predicted Tags: '
+        s += str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values)
         s += '\n' + str(preds.iloc[i, :])
         s += '\n'
-        with open('test-tag-' + str(df.index[rand_i]) + '.txt', 'w', encoding='utf-8') as f:
+        filename = 'test-tag-' + str(df.index[rand_i]) + '.txt'
+        with open(filename, 'w', encoding='utf-8') as f:
             f.write(s)
diff --git a/lib/newstag/crimetype/cli.py b/lib/newstag/crimetype/cli.py
@@ -0,0 +1,39 @@
+import sys
+import os
+import pickle
+from ..utils.model_helpers import LemmaTokenizer
+from .tag import Tagger
+
+"""
+A command line interface to the automatic article tagger.
+Run with `python -m newstag.crimetype.cli`
+"""
+
+if __name__ == '__main__':
+    # Command line entry point.
+    #
+    # With no arguments, text is read from stdin and the per-tag
+    # probabilities are printed to stdout. With filename arguments, each
+    # file is tagged and the result is written to `<filename>.tagged`.
+
+    # Answer a help request before building the Tagger, so that printing
+    # the usage text does not require unpickling the model first.
+    if len(sys.argv) > 1 and sys.argv[1] in ['-h', '--help']:
+        h = 'python -m newstag.crimetype.cli [filename [filename [...]]]\n'
+        h += '\n'
+        h += 'If no filenames are provided, read and tag from stdin.\n'
+        h += '(Use ctrl-d to stop inputting to stdin.)\n'
+        h += '\n'
+        h += 'Otherwise, tag all filenames, outputting the tags as a CSV\n'
+        h += 'to the file <filename>.tagged.'
+        print(h)
+        # sys.exit() rather than quit(): quit() is added by the `site`
+        # module for interactive sessions and is not meant for programs.
+        sys.exit()
+
+    tagger = Tagger()
+
+    if len(sys.argv) == 1:
+        print('Go ahead and start typing. Hit ctrl-d when done.')
+        s = sys.stdin.read()
+        preds = tagger.tagtext_proba(s)
+        # Most probable tag first.
+        preds = preds.sort_values(ascending=False)
+        for tag, prob in zip(preds.index, preds.values):
+            print('{: >5}, {:.9f}'.format(tag, prob))
+    else:
+        for filename in sys.argv[1:]:
+            with open(filename) as f_in:
+                preds = tagger.tagtext_proba(f_in.read())
+            # Most probable tag first.
+            preds = preds.sort_values(ascending=False)
+            with open(filename + '.tagged', 'w') as f_out:
+                for tag, prob in zip(preds.index, preds.values):
+                    f_out.write('{: >5}, {:.9f}\n'.format(tag, prob))
diff --git a/lib/newstag/crimetype/models/__init__.py b/lib/newstag/crimetype/models/__init__.py
diff --git a/lib/newstag/crimetype/models/binary_stemmed_logistic/__init__.py b/lib/newstag/crimetype/models/binary_stemmed_logistic/__init__.py
diff --git a/lib/newstag/crimetype/models/binary_stemmed_logistic/model.py b/lib/newstag/crimetype/models/binary_stemmed_logistic/model.py
@@ -0,0 +1,56 @@
+from ....utils import load_data as ld
+from ....utils.model_helpers import LemmaTokenizer
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import sklearn
+import sklearn.feature_extraction.text
+import sklearn.multiclass
+import sklearn.linear_model
+
+# Training script: builds the binary bag-of-lemmas vectorizer and the
+# one-vs-rest logistic regression tagger, then pickles both to disk.
+# Everything below runs at import time (the README runs this file with `-m`).
+df = ld.load_data()
+
+# Keep every article that has at least one tag in the 'OEMC'..'TASR' columns.
+crime_df = df.ix[df.loc[:, 'OEMC':'TASR'].any(1), :]
+print(crime_df.shape)
+# Add a random sample of 3000 rows selected by ~df['relevant'].
+# NOTE(review): assumes 'relevant' is a boolean column -- confirm.
+# NOTE(review): no random_state is passed, so the sample differs per run.
+crime_df = crime_df.append(df.ix[~df['relevant'], :].sample(n=3000, axis=0))
+print(crime_df.shape)
+
+# Shuffle the rows and split them 70% / 30% into trn / tst.
+# NOTE(review): the permutation is not seeded, so the split differs per run.
+idx = np.random.permutation(crime_df.shape[0])
+trn = crime_df.iloc[idx[:int(crime_df.shape[0] * 0.7)], :]
+tst = crime_df.iloc[idx[int(crime_df.shape[0] * 0.7):], :]
+print(trn.shape)
+print(tst.shape)
+
+# Vectorize the data. These two nltk names are not referenced below in this
+# script; LemmaTokenizer comes from utils.model_helpers.
+from nltk import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+
+# Binary (presence/absence) token counts; the vectorizer is fitted on the
+# training split only.
+vectorizer = sklearn.feature_extraction.text.CountVectorizer(tokenizer=LemmaTokenizer(),
+                                                             binary=True)
+X = vectorizer.fit_transform(trn['bodytext'].values)
+
+Y = trn.loc[:, 'OEMC':'TASR'].values
+
+clf = sklearn.multiclass.OneVsRestClassifier(
+    sklearn.linear_model.LogisticRegression()
+)
+
+# NOTE(review): X and Y are reassigned here from ALL of crime_df before the
+# classifier is fitted, so the trn-only X/Y above are discarded and the tst
+# rows are part of the training data. The trn split only determines what
+# the vectorizer was fitted on. Confirm this is intended.
+X = vectorizer.transform(crime_df['bodytext'].values)
+Y = crime_df.loc[:, 'OEMC':'TASR'].values
+
+clf.fit(X, Y)
+
+# Sanity check: print the per-tag probabilities for the text 'marijuana',
+# highest first.
+# NOTE(review): assumes df.columns[7:] are exactly the tag columns -- confirm.
+print(pd.DataFrame(
+    clf.predict_proba(vectorizer.transform(['marijuana'])),
+    columns=df.columns[7:]
+).T.sort_values(0, ascending=False))
+
+
+import pickle
+
+# The pickles are written relative to the current working directory.
+# NOTE(review): tag.py loads files with these names from
+# models/binary_stemmed_logistic/, so presumably the script must be run
+# from (or its output moved to) that directory -- confirm.
+with open('model.pkl', 'wb') as f:
+    pickle.dump(clf, f)
+with open('vectorizer.pkl', 'wb') as f:
+    pickle.dump(vectorizer, f)
diff --git a/lib/newstag/crimetype/tag.py b/lib/newstag/crimetype/tag.py
@@ -0,0 +1,92 @@
+import os
+import pickle
+import pandas as pd
+
+# not used explicitly, but this needs to be imported like this
+# for unpickling to work.
+from ..utils.model_helpers import LemmaTokenizer
+
+"""
+Contains the Tagger class that allows tagging of articles.
+
+To tag text from the command line, run the CLI module with
+`python -m newstag.crimetype.cli`
+"""
+
+# Default directory holding model.pkl and vectorizer.pkl, built relative to
+# this file's own location rather than the current working directory.
+MODEL_LOCATION = os.path.join(os.path.split(__file__)[0],
+                              'models/binary_stemmed_logistic/')
+
+# Tag names assigned, in this order, as the column labels of the classifier's
+# predict_proba output (see Tagger.tagtext_proba).
+# NOTE(review): the order presumably has to match the label column order used
+# when the model was trained -- confirm against the training script.
+TAGS = ['OEMC', 'CPD', 'SAO', 'CCCC', 'CCJ', 'CCSP',
+        'CPUB', 'IDOC', 'DOMV', 'SEXA', 'POLB', 'POLM',
+        'GUNV', 'GLBTQ', 'JUVE', 'REEN', 'VIOL', 'BEAT',
+        'PROB', 'PARL', 'CPLY', 'DRUG', 'CPS', 'GANG', 'ILSP',
+        'HOMI', 'IPRA', 'CPBD', 'IMMG', 'ENVI', 'UNSPC',
+        'ILSC', 'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR']
+
+
+def load_model(location=MODEL_LOCATION):
+    """
+    Load the pickled classifier and vectorizer from a directory.
+
+    inputs:
+        location: Directory containing `model.pkl` and `vectorizer.pkl`.
+    returns:
+        (clf, vectorizer): The two unpickled objects, in that order.
+    """
+    def _unpickle(basename):
+        # Pickle files must be opened in binary mode.
+        with open(os.path.join(location, basename), 'rb') as handle:
+            return pickle.load(handle)
+
+    # The classifier is read first, then the vectorizer.
+    return _unpickle('model.pkl'), _unpickle('vectorizer.pkl')
+
+
+class Tagger():
+    """
+    Tags article text using a pickled classifier and vectorizer.
+    """
+    def __init__(self, model_directory=MODEL_LOCATION):
+        loaded_clf, loaded_vectorizer = load_model(model_directory)
+        self.clf = loaded_clf
+        self.vectorizer = loaded_vectorizer
+
+    def tagtext_proba(self, text):
+        """
+        Compute the probability that each tag applies to the given text.
+
+        inputs:
+            text: A python string.
+        returns:
+            pred_proba: A pandas series indexed by the tag name, sorted
+                from highest to lowest probability.
+        """
+        features = self.vectorizer.transform([text])
+        proba_table = pd.DataFrame(self.clf.predict_proba(features))
+        proba_table.columns = TAGS
+        # A single text gives a single row; after transposing, that row
+        # is column 0, indexed by tag name.
+        per_tag = proba_table.T.iloc[:, 0]
+        return per_tag.sort_values(ascending=False)
+
+    def tagtext(self, text, prob_thresh=0.5):
+        """
+        Tag a string with labels.
+
+        inputs:
+            text: A python string.
+            prob_thresh: The probability a tag must exceed in order
+                to be applied.
+        returns:
+            preds: A list of the tags whose probability is strictly
+                greater than prob_thresh according to the model.
+        """
+        ranked = self.tagtext_proba(text)
+        above_threshold = ranked[ranked > prob_thresh]
+        return above_threshold.index.values.tolist()
+
+    def relevant(self, text, prob_thresh=0.05):
+        """
+        Determine whether the given text is relevant. Text is relevant
+        when at least one tag has a probability strictly greater than
+        prob_thresh according to the model.
+
+        inputs:
+            text: A python string.
+            prob_thresh: The threshold on probability that
+                determines relevance. If no tag has a probability
+                above prob_thresh, the text is not relevant.
+        returns:
+            relevant: Boolean. Is the text "relevant"?
+        """
+        return bool(self.tagtext(text, prob_thresh))
diff --git a/data/.gitignore → lib/newstag/data/.gitignore b/data/.gitignore → lib/newstag/data/.gitignore
diff --git a/data/column_names.txt → lib/newstag/data/column_names.txt b/data/column_names.txt → lib/newstag/data/column_names.txt
diff --git a/lib/newstag/geoloc/__init__.py b/lib/newstag/geoloc/__init__.py
diff --git a/src/link-to-crime-table.ipynb → lib/newstag/link-to-crime-table.ipynb b/src/link-to-crime-table.ipynb → lib/newstag/link-to-crime-table.ipynb
diff --git a/lib/newstag/tests/__init__.py b/lib/newstag/tests/__init__.py
diff --git a/lib/newstag/tests/test_binary_stemmed.py b/lib/newstag/tests/test_binary_stemmed.py
@@ -0,0 +1,33 @@
+import pytest
+import numpy as np
+
+import newstag
+
+def test_binary_stemmed_model():
+    """
+    Regression test: the packaged model must reproduce the recorded tag
+    probabilities, and their ordering, for a fixed piece of text.
+    """
+    article = ('This is an article about drugs and'
+               ' gangs. Copyright Kevin Rose.')
+    probabilities = newstag.crimetype.tag.Tagger().tagtext_proba(article)
+
+    # Recorded probabilities, highest first.
+    recorded_probabilities = np.array([
+        0.97203727, 0.88789787, 0.2082669, 0.14037048, 0.10828103,
+        0.09304668, 0.08045448, 0.07426908, 0.06969677, 0.06545745,
+        0.06261077, 0.05765306, 0.05581387, 0.05180351, 0.04338474,
+        0.03787152, 0.0372484, 0.03544577, 0.03451235, 0.03348183,
+        0.03332619, 0.03229261, 0.02849745, 0.02814688, 0.02736295,
+        0.02692241, 0.02479671, 0.02193516, 0.02170981, 0.02148271,
+        0.02139072, 0.02041881, 0.01952591, 0.01899151, 0.01520583,
+        0.0151583, 0.00705791, 0.00525071,
+    ])
+    np.testing.assert_array_almost_equal(
+        probabilities.values, recorded_probabilities, decimal=4
+    )
+
+    # Tag names in the same (descending probability) order.
+    recorded_tags = [
+        'GANG', 'DRUG', 'VIOL', 'IMMG', 'UNSPC', 'REEN', 'PARL', 'CPS',
+        'GUNV', 'ILSC', 'IPRA', 'DUI', 'CCJ', 'GLBTQ', 'BEAT', 'POLM',
+        'ILSP', 'CCSP', 'CPLY', 'ENVI', 'OEMC', 'TASR', 'CPBD', 'CPD',
+        'HOMI', 'JUVE', 'ARSN', 'BURG', 'SEXA', 'IDOC', 'SAO', 'CPUB',
+        'POLB', 'PROB', 'ROBB', 'FRUD', 'DOMV', 'CCCC',
+    ]
+    assert probabilities.index.values.tolist() == recorded_tags
diff --git a/lib/newstag/utils/__init__.py b/lib/newstag/utils/__init__.py
diff --git a/src/load_data.py → lib/newstag/utils/load_data.py b/src/load_data.py → lib/newstag/utils/load_data.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import os
 
-DATA_FOLDER = '../data/'
+DATA_FOLDER = os.path.join(os.path.split(__file__)[0], '../data/')
 
 def load_articles():
     """Loads the articles CSV."""

diff --git a/lib/newstag/utils/model_helpers.py b/lib/newstag/utils/model_helpers.py
@@ -0,0 +1,8 @@
+from nltk import word_tokenize
+from nltk.stem import WordNetLemmatizer
+
+class LemmaTokenizer(object):
+    """
+    Callable that splits a document into tokens and lemmatizes each one.
+
+    Instances are passed as the `tokenizer` of a CountVectorizer and are
+    pickled with it, so the class name and the `wnl` attribute are kept.
+    """
+
+    def __init__(self):
+        self.wnl = WordNetLemmatizer()
+
+    def __call__(self, doc):
+        """Return the list of lemmatized tokens of `doc`, in order."""
+        lemmas = []
+        for token in word_tokenize(doc):
+            lemmas.append(self.wnl.lemmatize(token))
+        return lemmas
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		1. Run `python setup.py install`.
		2. Verify the installation, with `python -m newstag.crimetype.cli`.