From 4c05fcc203bebaf7b21726f73c7daae85b038895 Mon Sep 17 00:00:00 2001 From: Kevin Rose Date: Thu, 24 Aug 2017 01:44:45 -0500 Subject: [PATCH] updating docs, loading, and tests --- .gitignore | 1 + CONTRIBUTING.md | 4 ++-- lib/tagnews/__init__.py | 2 +- lib/tagnews/crimetype/tag.py | 17 +++++++++++++++++ lib/tagnews/tests/test_binary_stemmed.py | 1 - lib/tagnews/tests/test_load_data.py | 22 ++++++++++++++++++++++ lib/tagnews/utils/load_data.py | 19 ++++++++++++------- setup.py | 2 +- 8 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 lib/tagnews/tests/test_load_data.py diff --git a/.gitignore b/.gitignore index fdcf332..3e8c46d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ lib/tagnews.egg-info build/ dist/ .eggs/ +.cache/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 860f310..14e427f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,8 +1,8 @@ # Setup -Fork this repo, download it, and navigate to it. If you're going to be developing it doesn't necessarily make sense to install as a package, but you'll still need to install the dependencies. See the [README](README.md) for more info. +Fork this repo, download it, and navigate to it. If you're going to be developing it doesn't necessarily make sense to install as a package, but you'll still need to install the dependencies. Head over to the [README](README.md) for instructions on installing the required dependencies. -Once that's done, a good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. There is no current cloud-based data sharing solution being used. Instead, it is contained on a USB drive, come to the Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers. +Done? Great, welcome back. A good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. There is no current cloud-based data sharing solution being used. Instead, it is contained on a USB drive, come to the Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers. # What can I do? diff --git a/lib/tagnews/__init__.py b/lib/tagnews/__init__.py index 8f9a59a..21bf5e1 100644 --- a/lib/tagnews/__init__.py +++ b/lib/tagnews/__init__.py @@ -1,4 +1,4 @@ from . import utils from . import crimetype -__version__ = '0.0.5' +__version__ = '0.1.0' diff --git a/lib/tagnews/crimetype/tag.py b/lib/tagnews/crimetype/tag.py index 0224e6a..1c06d50 100644 --- a/lib/tagnews/crimetype/tag.py +++ b/lib/tagnews/crimetype/tag.py @@ -74,6 +74,23 @@ def tagtext(self, text, prob_thresh=0.5): return preds[preds > prob_thresh].index.values.tolist() + def relevant_proba(self, text): + """ + Outputs the probability that the given text is relevant. + This probability is computed naively as the maximum of + the probabilities each tag applies to the text. + + A more nuanced method would compute a joint probability. + + inputs: + text: A python string. + + returns: + relevant_proba: Probability the text is relevant. + """ + return max(self.tagtext_proba(text)) + + def relevant(self, text, prob_thresh=0.05): """ Determines whether given text is relevant or not. Relevance diff --git a/lib/tagnews/tests/test_binary_stemmed.py b/lib/tagnews/tests/test_binary_stemmed.py index 7e20279..5a45331 100644 --- a/lib/tagnews/tests/test_binary_stemmed.py +++ b/lib/tagnews/tests/test_binary_stemmed.py @@ -1,4 +1,3 @@ -import pytest import numpy as np import tagnews diff --git a/lib/tagnews/tests/test_load_data.py b/lib/tagnews/tests/test_load_data.py new file mode 100644 index 0000000..2677ac4 --- /dev/null +++ b/lib/tagnews/tests/test_load_data.py @@ -0,0 +1,22 @@ +import os + +import pytest +import numpy as np + +from tagnews.utils import load_data + +_data_exists = os.path.isfile(os.path.join(load_data.__data_folder, + 'newsarticles_article.csv')) + +@pytest.mark.filterwarnings('ignore') +@pytest.mark.skipif(not _data_exists, + reason='Data must be downloaded to load!') +def test_load_data(): + df = load_data.load_data() + + # Just assert things about article with ID 12345 + row = df.loc[12345, :] + assert row['CCCC'] == 1 + assert row['CCJ'] == 0 + assert (row['bodytext'].split('\n')[0] + == "![Larry E. Price (Sheriff's photo)][1]") diff --git a/lib/tagnews/utils/load_data.py b/lib/tagnews/utils/load_data.py index 6914382..7d077d2 100644 --- a/lib/tagnews/utils/load_data.py +++ b/lib/tagnews/utils/load_data.py @@ -85,12 +85,6 @@ def load_data(data_folder=__data_folder, nrows=None): categories_df = load_categories(data_folder) categories_df.set_index('id', drop=True, inplace=True) - for i in range(tags_df['category_id'].max()): - # cat_name = 'cat_' + str(i+1) - cat_name = categories_df.loc[i+1, 'abbreviation'] - df[cat_name] = 0 - df[cat_name] = df[cat_name].astype('int8') # save on that memory! - # tags_df['category_id'] = tags_df['category_id'].astype(str) tags_df['category_abbreviation'] = (categories_df ['abbreviation'] @@ -102,8 +96,19 @@ def load_data(data_folder=__data_folder, nrows=None): article_ids = tags_df['article_id'].values cat_abbreviations = tags_df['category_abbreviation'].values + + # for some reason, some articles that are tagged don't show up + # in the articles CSV. filter those out. existing_ids_filter = np.isin(article_ids, df.index.values) - df.loc[article_ids[existing_ids_filter], cat_abbreviations[existing_ids_filter]] = 1 + + article_ids = article_ids[existing_ids_filter] + cat_abbreviations = cat_abbreviations[existing_ids_filter] + + for i in range(tags_df['category_id'].max()): + cat_name = categories_df.loc[i+1, 'abbreviation'] + df[cat_name] = 0 + df[cat_name] = df[cat_name].astype('int8') # save on that memory! + df.loc[article_ids[cat_abbreviations == cat_name], cat_name] = 1 return df diff --git a/setup.py b/setup.py index e777cc9..6d70012 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run(self): nltk.download(nltk_package) setup(name='tagnews', - version='0.0.5', + version='0.1.0', description='automatically tag articles with justice-related categories', author='Kevin Rose', url='https://github.com/chicago-justice-project/article-tagging',