Skip to content

Commit

Permalink
major refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kbrose committed Aug 22, 2017
1 parent f267d54 commit 6341db1
Show file tree
Hide file tree
Showing 39 changed files with 1,705 additions and 9,060 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
*.pyc
*.ipynb_checkpoints*
*.pkl
lib/newstag.egg-info
build/
dist/
2 changes: 2 additions & 0 deletions INSTALLATION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1. Run `python setup.py install`.
2. Verify the installation with `python -m newstag.crimetype.cli`.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

Let's compare how often different types of crime are reported in certain areas with how often they actually occur in those areas. Are some crimes under-represented in certain areas but over-represented in others? To accomplish this, we'll need to be able to extract type-of-crime tags and geospatial data from news articles. We meet every Tuesday at [Chi Hack Night](https://chihacknight.org/), and you can find out more about [this specific project here](https://github.com/chihacknight/breakout-groups/issues/61).

For installation instructions, see [INSTALLATION.md](./INSTALLATION.md)

# Details

The [Chicago Justice Project](http://chicagojustice.org/) has been scraping RSS feeds of articles written by Chicago area news outlets for several years, allowing them to collect almost 300,000 articles. At the same time, an amazing group of [volunteers](http://chicagojustice.org/volunteer-for-cjp/) have helped them tag these articles. The tags include crime categories like "Gun Violence", "Drugs", "Sexual Assault", but also organizations such as "Cook County State's Attorney's Office", "Illinois State Police", "Chicago Police Department", and other miscellaneous categories such as "LGBTQ", "Immigration".
Expand Down Expand Up @@ -39,7 +41,7 @@ Some articles may discuss multiple crimes. Some crimes may occur in multiple are

# The Code

Under the `src` folder you can find the source code.
Under the `lib` folder you can find the source code.

The `load_data.py` file loads the data from the CSV files (which are not stored in GitHub). Specifically, look at the `load_data.load_data()` function, which returns the `k`-hot encoded tags together with the article data.

Expand All @@ -60,3 +62,7 @@ Help [the team scraping articles](https://github.com/chicago-justice-project/chi
* [Chicago Justice Project](http://chicagojustice.org/)
* [Database Repo](https://github.com/kyaroch/chicago-justice)
* [Chi Hack Night Group Description](https://github.com/chihacknight/breakout-groups/issues/61)

# Saving a new model

Working with pickle is difficult. In order to be able to load things sanely, I'm running the python files that pickle the model using the `-m` flag, e.g. `python -m newstag.crimetype.models.binary_stemmed_logistic.model`.
4 changes: 4 additions & 0 deletions lib/newstag/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from . import utils
from . import crimetype

__version__ = '0.0.2'
2 changes: 2 additions & 0 deletions lib/newstag/crimetype/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from . import tag
from . import benchmark
15 changes: 12 additions & 3 deletions src/benchmark_tagging.py → lib/newstag/crimetype/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_kfold_split(N, k=4):
return index_pairs


def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4, verbose=False):
"""
benchmark a classifier on preprocessed data.
Expand All @@ -47,6 +47,7 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):
dictionary of parameters passed to the classifier factory.
If None, no parameters are passed.
k : how many folds to use for cross validation
verbose : Should status be printed?
"""
if clf_params_dict is None:
clf_params_dict = {}
Expand All @@ -61,6 +62,9 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):

clfs = []
for i, (idx_trn, idx_tst) in enumerate(fold_indexes):
if verbose:
print('step {} of {}...'.format(i, k), end='')

clf = clf_factory(**clf_params_dict)

x_trn = X[idx_trn, :]
Expand All @@ -85,6 +89,9 @@ def benchmark(clf_factory, X, Y, clf_params_dict=None, k=4):

clfs.append(clf)

if verbose:
print('done')

return {'acc': acc, 'tpr': tpr, 'fpr': fpr, 'ppv': ppv, 'clfs': clfs}


Expand All @@ -103,8 +110,10 @@ def predict_articles(clf, vectorizer, df, n=100, seed=1029384756):
for i, rand_i in enumerate(random_subset):
s = 'Article ID: ' + str(df.index[rand_i])
s += '\n' + df.iloc[rand_i, 3]
s += '\n Predicted Tags: ' + str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values)
s += '\n Predicted Tags: '
s += str(preds.iloc[i, :].index[preds.iloc[i, :] > 0.5].values)
s += '\n' + str(preds.iloc[i, :])
s += '\n'
with open('test-tag-' + str(df.index[rand_i]) + '.txt', 'w', encoding='utf-8') as f:
filename = 'test-tag-' + str(df.index[rand_i]) + '.txt'
with open(filename, 'w', encoding='utf-8') as f:
f.write(s)
39 changes: 39 additions & 0 deletions lib/newstag/crimetype/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import os
import pickle
from ..utils.model_helpers import LemmaTokenizer
from .tag import Tagger

"""
A command line interface to the automatic article tagger.
Run with `python -m newstag.crimetype.cli`
"""

def format_predictions(preds):
    """
    Render tag probabilities as text lines, most probable tag first.
    inputs:
        preds: A pandas series of probabilities indexed by tag name.
    returns:
        lines: A list of strings '<tag>, <probability>' (no trailing
            newline). The tag is right-aligned to five characters and
            the probability is printed with nine decimals.
    """
    ranked = preds.sort_values(ascending=False)
    return ['{: >5}, {:.9f}'.format(tag, prob)
            for tag, prob in zip(ranked.index, ranked.values)]


def main(argv):
    """
    Entry point of the command line tagger.
    inputs:
        argv: The full argument vector, including the program name
            in position 0 (i.e. `sys.argv`).
    With no arguments, text is read from stdin and the tag
    probabilities are printed. With `-h`/`--help` as the first
    argument, usage is printed. Otherwise every argument is treated as
    a filename and the tags are written to `<filename>.tagged`.
    """
    filenames = argv[1:]

    # Answer the help request before loading the model, so that usage
    # is available even when the pickled model cannot be loaded.
    if filenames and filenames[0] in ('-h', '--help'):
        h = 'python -m newstag.crimetype.cli [filename [filename [...]]]\n'
        h += '\n'
        h += 'If no filenames are provided, read and tag from stdin.\n'
        h += '(Use ctrl-d to stop inputting to stdin.)\n'
        h += '\n'
        h += 'Otherwise, tag all filenames, outputting the tags as a CSV\n'
        h += 'to the file <filename>.tagged.'
        print(h)
        return

    tagger = Tagger()

    if not filenames:
        print('Go ahead and start typing. Hit ctrl-d when done.')
        preds = tagger.tagtext_proba(sys.stdin.read())
        for line in format_predictions(preds):
            print(line)
        return

    for filename in filenames:
        with open(filename) as f_in:
            preds = tagger.tagtext_proba(f_in.read())
        with open(filename + '.tagged', 'w') as f_out:
            for line in format_predictions(preds):
                f_out.write(line + '\n')


if __name__ == '__main__':
    main(sys.argv)
Empty file.
Empty file.
56 changes: 56 additions & 0 deletions lib/newstag/crimetype/models/binary_stemmed_logistic/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from ....utils import load_data as ld
from ....utils.model_helpers import LemmaTokenizer
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
import sklearn.multiclass
import sklearn.linear_model

# Training script for the binary, lemmatized, one-vs-rest logistic model.
# Running this module fits the vectorizer and classifier and pickles both.
df = ld.load_data()

# Keep every article that carries at least one tag in the contiguous
# label columns 'OEMC'..'TASR'. `.loc` replaces the deprecated `.ix`;
# for a boolean mask the selection is the same.
has_tag = df.loc[:, 'OEMC':'TASR'].any(axis=1)
crime_df = df.loc[has_tag, :]
print(crime_df.shape)
# Add a random sample of 3000 articles not flagged `relevant`
# (presumably a boolean column -- TODO confirm against load_data).
# `pd.concat` replaces the deprecated `DataFrame.append`.
negatives = df.loc[~df['relevant'], :].sample(n=3000, axis=0)
crime_df = pd.concat([crime_df, negatives])
print(crime_df.shape)

# Random 70/30 split of the selected rows.
idx = np.random.permutation(crime_df.shape[0])
n_trn = int(crime_df.shape[0] * 0.7)
trn = crime_df.iloc[idx[:n_trn], :]
tst = crime_df.iloc[idx[n_trn:], :]
print(trn.shape)
print(tst.shape)

# vectorize data
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=LemmaTokenizer(),
    binary=True
)
# NOTE(review): the vocabulary is learned from the training split only,
# while the classifier below is fit on every row of `crime_df`; `tst` is
# never used for evaluation. Confirm this is intended.
vectorizer.fit(trn['bodytext'].values)

clf = sklearn.multiclass.OneVsRestClassifier(
    sklearn.linear_model.LogisticRegression()
)

labels = crime_df.loc[:, 'OEMC':'TASR']
X = vectorizer.transform(crime_df['bodytext'].values)
Y = labels.values

clf.fit(X, Y)

# Sanity check: print the tag probabilities for a single keyword.
# The column names come from the same slice used to build Y, so they
# always line up with the classifier outputs.
print(pd.DataFrame(
    clf.predict_proba(vectorizer.transform(['marijuana'])),
    columns=labels.columns
).T.sort_values(0, ascending=False))


import pickle

# NOTE(review): both files are written to the current working directory,
# whereas tag.load_model reads them from models/binary_stemmed_logistic/
# next to tag.py -- confirm where this script is expected to be run from.
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
92 changes: 92 additions & 0 deletions lib/newstag/crimetype/tag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import pickle
import pandas as pd

# not used explicitly, but this needs to be imported like this
# for unpickling to work.
from ..utils.model_helpers import LemmaTokenizer

"""
Contains the Tagger class that allows tagging of articles.
This module has no command line entry point of its own; the command
line interface lives in cli.py and is run with
`python -m newstag.crimetype.cli`
"""

# Directory holding the default pickled model (model.pkl and
# vectorizer.pkl), resolved relative to this file.
MODEL_LOCATION = os.path.join(os.path.dirname(__file__),
                              'models/binary_stemmed_logistic/')

# Tag names, in the order used to label the classifier's output columns.
TAGS = ['OEMC', 'CPD', 'SAO', 'CCCC', 'CCJ', 'CCSP',
        'CPUB', 'IDOC', 'DOMV', 'SEXA', 'POLB', 'POLM',
        'GUNV', 'GLBTQ', 'JUVE', 'REEN', 'VIOL', 'BEAT',
        'PROB', 'PARL', 'CPLY', 'DRUG', 'CPS', 'GANG', 'ILSP',
        'HOMI', 'IPRA', 'CPBD', 'IMMG', 'ENVI', 'UNSPC',
        'ILSC', 'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR']


def load_model(location=MODEL_LOCATION):
    """
    Unpickle the classifier and the vectorizer stored in a directory.
    inputs:
        location: Directory containing `model.pkl` and
            `vectorizer.pkl`.
    returns:
        clf, vectorizer: The two unpickled objects, in that order.
    """
    # pickle.load executes code embedded in the file, so `location`
    # should only ever point at trusted model files.
    unpickled = []
    for pickle_name in ('model.pkl', 'vectorizer.pkl'):
        with open(os.path.join(location, pickle_name), 'rb') as f:
            unpickled.append(pickle.load(f))

    clf, vectorizer = unpickled
    return clf, vectorizer


class Tagger():
    """
    Tags article text using a pickled classifier/vectorizer pair.
    """
    def __init__(self, model_directory=MODEL_LOCATION):
        """
        Load the model to tag with.
        inputs:
            model_directory: Directory passed on to load_model.
        """
        loaded_clf, loaded_vectorizer = load_model(model_directory)
        self.clf = loaded_clf
        self.vectorizer = loaded_vectorizer

    def tagtext_proba(self, text):
        """
        Compute, for every tag, the probability that it applies to
        the given text.
        inputs:
            text: A python string.
        returns:
            pred_proba: A pandas series indexed by the tag name,
                        sorted from most to least probable.
        """
        # The text is vectorized as a single-document batch, so the
        # probability table has exactly one row.
        features = self.vectorizer.transform([text])
        proba_table = pd.DataFrame(self.clf.predict_proba(features),
                                   columns=TAGS)
        return proba_table.iloc[0].sort_values(ascending=False)

    def tagtext(self, text, prob_thresh=0.5):
        """
        Tag a string with labels.
        inputs:
            text: A python string.
            prob_thresh: Probability a tag must strictly exceed
                         in order to be applied.
        returns:
            preds: A list of the tags whose probability is
                   > prob_thresh according to the model.
        """
        ranked = self.tagtext_proba(text)
        above_threshold = ranked > prob_thresh
        return ranked.index[above_threshold].tolist()

    def relevant(self, text, prob_thresh=0.05):
        """
        Decide whether the given text is relevant, i.e. whether at
        least one tag has a probability strictly greater than
        prob_thresh according to the model.
        inputs:
            text: A python string.
            prob_thresh: The threshold on probability that
                         determines relevance. If no tag has a
                         probability > prob_thresh, the text is
                         not relevant.
        returns:
            relevant: Boolean. Is the text "relevant"?
        """
        return bool(self.tagtext(text, prob_thresh))
File renamed without changes.
File renamed without changes.
Empty file added lib/newstag/geoloc/__init__.py
Empty file.
File renamed without changes.
Empty file added lib/newstag/tests/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions lib/newstag/tests/test_binary_stemmed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pytest
import numpy as np

import newstag

def test_binary_stemmed_model():
    """
    Regression test for the default pickled model: the probabilities
    for a fixed sentence must match the recorded values to four
    decimals, and the tags must come back in the recorded order.
    """
    # Recorded output, most probable tag first.
    expected_columns = [
        'GANG', 'DRUG', 'VIOL', 'IMMG', 'UNSPC', 'REEN', 'PARL', 'CPS',
        'GUNV', 'ILSC', 'IPRA', 'DUI', 'CCJ', 'GLBTQ', 'BEAT', 'POLM',
        'ILSP', 'CCSP', 'CPLY', 'ENVI', 'OEMC', 'TASR', 'CPBD', 'CPD',
        'HOMI', 'JUVE', 'ARSN', 'BURG', 'SEXA', 'IDOC', 'SAO', 'CPUB',
        'POLB', 'PROB', 'ROBB', 'FRUD', 'DOMV', 'CCCC'
    ]
    expected_values = np.array(
        [0.97203727, 0.88789787, 0.2082669 , 0.14037048, 0.10828103,
         0.09304668, 0.08045448, 0.07426908, 0.06969677, 0.06545745,
         0.06261077, 0.05765306, 0.05581387, 0.05180351, 0.04338474,
         0.03787152, 0.0372484 , 0.03544577, 0.03451235, 0.03348183,
         0.03332619, 0.03229261, 0.02849745, 0.02814688, 0.02736295,
         0.02692241, 0.02479671, 0.02193516, 0.02170981, 0.02148271,
         0.02139072, 0.02041881, 0.01952591, 0.01899151, 0.01520583,
         0.0151583 , 0.00705791, 0.00525071])

    article = ('This is an article about drugs and'
               ' gangs. Copyright Kevin Rose.')
    computed = newstag.crimetype.tag.Tagger().tagtext_proba(article)

    np.testing.assert_array_almost_equal(computed.values,
                                         expected_values,
                                         decimal=4)
    assert computed.index.values.tolist() == expected_columns
Empty file added lib/newstag/utils/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion src/load_data.py → lib/newstag/utils/load_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import os

DATA_FOLDER = '../data/'
DATA_FOLDER = os.path.join(os.path.split(__file__)[0], '../data/')

def load_articles():
"""Loads the articles CSV."""
Expand Down
8 changes: 8 additions & 0 deletions lib/newstag/utils/model_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    """
    Callable tokenizer: splits a document with nltk's word_tokenize
    and lemmatizes every token with a WordNetLemmatizer.
    """
    def __init__(self):
        # The attribute name `wnl` is part of the pickled state of
        # stored vectorizers that hold an instance of this class.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas
Loading

0 comments on commit 6341db1

Please sign in to comment.