Skip to content

Commit

Permalink
updating docs, loading, and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kbrose committed Aug 29, 2017
1 parent 30a7ab2 commit 4c05fcc
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ lib/tagnews.egg-info
build/
dist/
.eggs/
.cache/
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Setup

Fork this repo, download it, and navigate to it. If you're going to be developing it doesn't necessarily make sense to install as a package, but you'll still need to install the dependencies. See the [README](README.md) for more info.
Fork this repo, download it, and navigate to it. If you're going to be developing, it doesn't necessarily make sense to install as a package, but you'll still need to install the dependencies. Head over to the [README](README.md) for instructions on installing the required dependencies.

Once that's done, a good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. There is no current cloud-based data sharing solution being used. Instead, it is contained on a USB drive, come to the Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers.
Done? Great, welcome back. A good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. No cloud-based data sharing solution is currently in use. Instead, the data is stored on a USB drive; come to the Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers.

# What can I do?

Expand Down
2 changes: 1 addition & 1 deletion lib/tagnews/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import utils
from . import crimetype

__version__ = '0.0.5'
__version__ = '0.1.0'
17 changes: 17 additions & 0 deletions lib/tagnews/crimetype/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ def tagtext(self, text, prob_thresh=0.5):
return preds[preds > prob_thresh].index.values.tolist()


def relevant_proba(self, text):
    """
    Estimates how likely it is that the given text is relevant.
    The estimate is deliberately naive: it is the largest of the
    per-tag probabilities reported by `tagtext_proba` for the text.
    Computing a joint probability across the tags would be a more
    nuanced approach.
    inputs:
        text: A python string.
    returns:
        relevant_proba: Probability the text is relevant.
    """
    tag_probabilities = self.tagtext_proba(text)
    return max(tag_probabilities)


def relevant(self, text, prob_thresh=0.05):
"""
Determines whether given text is relevant or not. Relevance
Expand Down
1 change: 0 additions & 1 deletion lib/tagnews/tests/test_binary_stemmed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pytest
import numpy as np

import tagnews
Expand Down
22 changes: 22 additions & 0 deletions lib/tagnews/tests/test_load_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os

import pytest
import numpy as np

from tagnews.utils import load_data

# The test below only makes sense when the articles CSV has been
# downloaded into the data folder that load_data reads from.
_data_exists = os.path.isfile(
    os.path.join(load_data.__data_folder, 'newsarticles_article.csv')
)


@pytest.mark.filterwarnings('ignore')
@pytest.mark.skipif(not _data_exists,
                    reason='Data must be downloaded to load!')
def test_load_data():
    """Spot-check a single known article in the loaded data frame."""
    articles = load_data.load_data()

    # Article 12345 is the one fixed reference row checked here.
    article = articles.loc[12345, :]
    assert article['CCCC'] == 1
    assert article['CCJ'] == 0

    first_line = article['bodytext'].split('\n')[0]
    assert first_line == "![Larry E. Price (Sheriff's photo)][1]"
19 changes: 12 additions & 7 deletions lib/tagnews/utils/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,6 @@ def load_data(data_folder=__data_folder, nrows=None):
categories_df = load_categories(data_folder)
categories_df.set_index('id', drop=True, inplace=True)

for i in range(tags_df['category_id'].max()):
# cat_name = 'cat_' + str(i+1)
cat_name = categories_df.loc[i+1, 'abbreviation']
df[cat_name] = 0
df[cat_name] = df[cat_name].astype('int8') # save on that memory!

# tags_df['category_id'] = tags_df['category_id'].astype(str)
tags_df['category_abbreviation'] = (categories_df
['abbreviation']
Expand All @@ -102,8 +96,19 @@ def load_data(data_folder=__data_folder, nrows=None):

article_ids = tags_df['article_id'].values
cat_abbreviations = tags_df['category_abbreviation'].values

# for some reason, some articles that are tagged don't show up
# in the articles CSV. filter those out.
existing_ids_filter = np.isin(article_ids, df.index.values)
df.loc[article_ids[existing_ids_filter], cat_abbreviations[existing_ids_filter]] = 1

article_ids = article_ids[existing_ids_filter]
cat_abbreviations = cat_abbreviations[existing_ids_filter]

for i in range(tags_df['category_id'].max()):
cat_name = categories_df.loc[i+1, 'abbreviation']
df[cat_name] = 0
df[cat_name] = df[cat_name].astype('int8') # save on that memory!
df.loc[article_ids[cat_abbreviations == cat_name], cat_name] = 1

return df

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def run(self):
nltk.download(nltk_package)

setup(name='tagnews',
version='0.0.5',
version='0.1.0',
description='automatically tag articles with justice-related categories',
author='Kevin Rose',
url='https://github.com/chicago-justice-project/article-tagging',
Expand Down

0 comments on commit 4c05fcc

Please sign in to comment.