-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
39 changed files
with
1,705 additions
and
9,060 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
*.pyc | ||
*.ipynb_checkpoints* | ||
*.pkl | ||
lib/newstag.egg-info | ||
build/ | ||
dist/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
1. Run `python setup.py install`. | ||
2. Verify the installation, with `python -m newstag.crimetype.cli`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from . import utils | ||
from . import crimetype | ||
|
||
__version__ = '0.0.2' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from . import tag | ||
from . import benchmark |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import sys | ||
import os | ||
import pickle | ||
from ..utils.model_helpers import LemmaTokenizer | ||
from .tag import Tagger | ||
|
||
""" | ||
A command line interface to the automatic article tagger. | ||
Run with `python -m newstag.crimetype.cli` | ||
""" | ||
|
||
def _usage():
    """Return the help text printed for ``-h`` / ``--help``."""
    return (
        'python -m newstag.crimetype.cli [filename [filename [...]]]\n'
        '\n'
        'If no filenames are provided, read and tag from stdin.\n'
        '(Use ctrl-d to stop inputting to stdin.)\n'
        '\n'
        'Otherwise, tag all filenames, outputting the tags as a CSV\n'
        'to the file <filename>.tagged.'
    )


def _ranked_lines(tagger, text):
    """
    Tag `text` and return one 'TAG, probability' line per tag, ordered
    from the highest probability to the lowest.
    """
    preds = tagger.tagtext_proba(text).sort_values(ascending=False)
    return ['{: >5}, {:.9f}'.format(tag, prob)
            for tag, prob in zip(preds.index, preds.values)]


def main(argv=None):
    """
    Command line entry point.
    inputs:
        argv: The command line arguments, without the program name.
              Defaults to sys.argv[1:].
    behavior:
        -h / --help as first argument: print the usage text.
        no arguments: read text from stdin and print its tags.
        otherwise: treat every argument as a filename and write the
                   tags for <filename> to <filename>.tagged.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    # Answer help requests before building the Tagger, so that asking
    # for usage does not require loading the pickled model.
    if args and args[0] in ('-h', '--help'):
        print(_usage())
        return

    tagger = Tagger()

    if not args:
        print('Go ahead and start typing. Hit ctrl-d when done.')
        for line in _ranked_lines(tagger, sys.stdin.read()):
            print(line)
        return

    for filename in args:
        with open(filename) as f_in:
            lines = _ranked_lines(tagger, f_in.read())
        with open(filename + '.tagged', 'w') as f_out:
            f_out.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main()
Empty file.
Empty file.
56 changes: 56 additions & 0 deletions
56
lib/newstag/crimetype/models/binary_stemmed_logistic/model.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from ....utils import load_data as ld | ||
from ....utils.model_helpers import LemmaTokenizer | ||
import matplotlib | ||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
import pandas as pd | ||
import sklearn | ||
import sklearn.feature_extraction.text | ||
import sklearn.multiclass | ||
import sklearn.linear_model | ||
|
||
# Training script: fits a binary bag-of-words, one-vs-rest logistic
# regression tagger and pickles the classifier and its vectorizer into
# the current working directory.
df = ld.load_data()

# Keep every row with at least one truthy value in the 'OEMC'..'TASR'
# column range.  Boolean-mask `.loc` is used instead of the `.ix`
# indexer, which current pandas releases no longer provide.
has_any_tag = df.loc[:, 'OEMC':'TASR'].any(axis=1)
crime_df = df.loc[has_any_tag, :]
print(crime_df.shape)

# Add 3000 randomly sampled rows whose 'relevant' flag is false.
# `pd.concat` is used instead of `DataFrame.append`, which current
# pandas releases no longer provide.
not_relevant = df.loc[~df['relevant'], :].sample(n=3000, axis=0)
crime_df = pd.concat([crime_df, not_relevant])
print(crime_df.shape)

# Random 70/30 split of the rows.
idx = np.random.permutation(crime_df.shape[0])
n_train = int(crime_df.shape[0] * 0.7)
trn = crime_df.iloc[idx[:n_train], :]
tst = crime_df.iloc[idx[n_train:], :]
print(trn.shape)
print(tst.shape)

# vectorize data
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# The vocabulary is learned from the 70% split only.
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=LemmaTokenizer(),
    binary=True
)
vectorizer.fit(trn['bodytext'].values)

clf = sklearn.multiclass.OneVsRestClassifier(
    sklearn.linear_model.LogisticRegression()
)

# NOTE(review): the classifier is fit on every row of crime_df, which
# includes the rows held out in `tst`, so `tst` cannot serve as an
# unbiased evaluation set -- confirm this is intended.
X = vectorizer.transform(crime_df['bodytext'].values)
Y = crime_df.loc[:, 'OEMC':'TASR'].values

clf.fit(X, Y)

# Smoke check: print the per-tag probabilities for a one-word document.
# NOTE(review): assumes the tag columns start at position 7 of df and
# line up with the 'OEMC':'TASR' slice used for Y -- confirm.
print(pd.DataFrame(
    clf.predict_proba(vectorizer.transform(['marijuana'])),
    columns=df.columns[7:]
).T.sort_values(0, ascending=False))

import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import os | ||
import pickle | ||
import pandas as pd | ||
|
||
# not used explicitly, but this needs to be imported like this | ||
# for unpickling to work. | ||
from ..utils.model_helpers import LemmaTokenizer | ||
|
||
"""
Contains the Tagger class that allows tagging of articles.
To tag articles from the command line, run
`python -m newstag.crimetype.cli`
"""
|
||
# Directory holding the pickled model files, resolved relative to this
# source file.
MODEL_LOCATION = os.path.join(
    os.path.dirname(__file__),
    'models/binary_stemmed_logistic/',
)

# Tag names, in the order used to label the columns of the model's
# predicted probabilities.
TAGS = [
    'OEMC', 'CPD', 'SAO', 'CCCC', 'CCJ', 'CCSP', 'CPUB', 'IDOC',
    'DOMV', 'SEXA', 'POLB', 'POLM', 'GUNV', 'GLBTQ', 'JUVE', 'REEN',
    'VIOL', 'BEAT', 'PROB', 'PARL', 'CPLY', 'DRUG', 'CPS', 'GANG',
    'ILSP', 'HOMI', 'IPRA', 'CPBD', 'IMMG', 'ENVI', 'UNSPC', 'ILSC',
    'ARSN', 'BURG', 'DUI', 'FRUD', 'ROBB', 'TASR',
]
|
||
|
||
def load_model(location=MODEL_LOCATION):
    """
    Load the pickled classifier and vectorizer from a directory.
    inputs:
        location: Directory containing 'model.pkl' and
                  'vectorizer.pkl'.
    returns:
        (clf, vectorizer): The two unpickled objects, in that order.
    """
    def _unpickle(filename):
        with open(os.path.join(location, filename), 'rb') as handle:
            return pickle.load(handle)

    # NOTE: unpickling can execute arbitrary code, so `location` should
    # only ever point at trusted model files.
    clf = _unpickle('model.pkl')
    vectorizer = _unpickle('vectorizer.pkl')
    return clf, vectorizer
|
||
|
||
class Tagger:
    """
    Wraps a loaded classifier/vectorizer pair and uses it to tag
    article text.
    """

    def __init__(self, model_directory=MODEL_LOCATION):
        """
        inputs:
            model_directory: Directory passed to load_model to obtain
                             the classifier and the vectorizer.
        """
        model, vectorizer = load_model(model_directory)
        self.clf = model
        self.vectorizer = vectorizer

    def tagtext_proba(self, text):
        """
        Compute, for every tag, the probability that it applies to
        the given text.
        inputs:
            text: A python string.
        returns:
            pred_proba: A pandas series indexed by the tag name and
                        sorted from highest to lowest probability.
        """
        features = self.vectorizer.transform([text])
        frame = pd.DataFrame(self.clf.predict_proba(features))
        frame.columns = TAGS
        # Exactly one document was vectorized, so row 0 holds its
        # per-tag probabilities.
        first_row = frame.iloc[0]
        return first_row.sort_values(ascending=False)

    def tagtext(self, text, prob_thresh=0.5):
        """
        Tag a string with labels.
        inputs:
            text: A python string.
            prob_thresh: The probability a tag has to exceed in
                         order to be applied.
        returns:
            preds: A list of the tags whose probability is strictly
                   greater than prob_thresh, most probable first.
        """
        probabilities = self.tagtext_proba(text)
        above_threshold = probabilities > prob_thresh
        return probabilities.index[above_threshold.values].tolist()

    def relevant(self, text, prob_thresh=0.05):
        """
        Decide whether the given text is relevant, i.e. whether
        tagtext applies at least one tag at the given threshold.
        inputs:
            text: A python string.
            prob_thresh: The probability threshold handed to
                         tagtext. The text is relevant when at least
                         one tag has a probability strictly greater
                         than this value.
        returns:
            relevant: Boolean. Is the text "relevant"?
        """
        return bool(self.tagtext(text, prob_thresh))
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import pytest | ||
import numpy as np | ||
|
||
import newstag | ||
|
||
def test_binary_stemmed_model():
    """
    Regression test: the default Tagger must reproduce the recorded
    probabilities (to 4 decimals) and tag ordering for a fixed sentence.
    """
    article = ('This is an article about drugs and'
               ' gangs. Copyright Kevin Rose.')
    probabilities = newstag.crimetype.tag.Tagger().tagtext_proba(article)

    # Recorded output, listed from most to least probable tag.
    expected_tags = [
        'GANG', 'DRUG', 'VIOL', 'IMMG', 'UNSPC', 'REEN', 'PARL', 'CPS',
        'GUNV', 'ILSC', 'IPRA', 'DUI', 'CCJ', 'GLBTQ', 'BEAT', 'POLM',
        'ILSP', 'CCSP', 'CPLY', 'ENVI', 'OEMC', 'TASR', 'CPBD', 'CPD',
        'HOMI', 'JUVE', 'ARSN', 'BURG', 'SEXA', 'IDOC', 'SAO', 'CPUB',
        'POLB', 'PROB', 'ROBB', 'FRUD', 'DOMV', 'CCCC',
    ]
    expected_probabilities = np.array([
        0.97203727, 0.88789787, 0.2082669, 0.14037048, 0.10828103,
        0.09304668, 0.08045448, 0.07426908, 0.06969677, 0.06545745,
        0.06261077, 0.05765306, 0.05581387, 0.05180351, 0.04338474,
        0.03787152, 0.0372484, 0.03544577, 0.03451235, 0.03348183,
        0.03332619, 0.03229261, 0.02849745, 0.02814688, 0.02736295,
        0.02692241, 0.02479671, 0.02193516, 0.02170981, 0.02148271,
        0.02139072, 0.02041881, 0.01952591, 0.01899151, 0.01520583,
        0.0151583, 0.00705791, 0.00525071,
    ])

    np.testing.assert_array_almost_equal(probabilities.values,
                                         expected_probabilities,
                                         decimal=4)

    assert probabilities.index.values.tolist() == expected_tags
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from nltk import word_tokenize | ||
from nltk.stem import WordNetLemmatizer | ||
|
||
class LemmaTokenizer(object):
    """
    Callable tokenizer: splits a document with nltk's word_tokenize and
    lemmatizes every resulting token with a WordNetLemmatizer.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        tokens = word_tokenize(doc)
        return [self.wnl.lemmatize(token) for token in tokens]
Oops, something went wrong.