From 4c05fcc203bebaf7b21726f73c7daae85b038895 Mon Sep 17 00:00:00 2001
From: Kevin Rose <kevin+gh@maypark.com>
Date: Thu, 24 Aug 2017 01:44:45 -0500
Subject: [PATCH] updating docs, loading, and tests

---
 .gitignore                               |  1 +
 CONTRIBUTING.md                          |  4 ++--
 lib/tagnews/__init__.py                  |  2 +-
 lib/tagnews/crimetype/tag.py             | 17 +++++++++++++++++
 lib/tagnews/tests/test_binary_stemmed.py |  1 -
 lib/tagnews/tests/test_load_data.py      | 22 ++++++++++++++++++++++
 lib/tagnews/utils/load_data.py           | 19 ++++++++++++-------
 setup.py                                 |  2 +-
 8 files changed, 56 insertions(+), 12 deletions(-)
 create mode 100644 lib/tagnews/tests/test_load_data.py

diff --git a/.gitignore b/.gitignore
index fdcf332..3e8c46d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ lib/tagnews.egg-info
 build/
 dist/
 .eggs/
+.cache/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 860f310..14e427f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,8 +1,8 @@
 # Setup
 
-Fork this repo, download it, and navigate to it. If you're going to be developing it doesn't necessarily make sense to install as a package, but you'll still need to install the dependencies. See the [README](README.md) for more info.
+Fork this repo, download it, and navigate to it. If you're going to be developing, it doesn't necessarily make sense to install it as a package, but you'll still need to install the dependencies. Head over to the [README](README.md) for instructions on installing the required dependencies.
 
-Once that's done, a good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. There is no current cloud-based data sharing solution being used. Instead, it is contained on a USB drive, come to the Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers.
+Done? Great, welcome back. A good place to start is the [notebooks](./lib/notebooks). Reading through these should help you get up to speed, and running them is a pretty good test to make sure everything is installed correctly. You will need the data to run the notebooks. There is currently no cloud-based data-sharing solution in use. Instead, the data is kept on a USB drive; come to a Chi Hack Night meeting to get it! If this will be a problem for you but you are still interested, contact one of the maintainers.
 
 # What can I do?
 
diff --git a/lib/tagnews/__init__.py b/lib/tagnews/__init__.py
index 8f9a59a..21bf5e1 100644
--- a/lib/tagnews/__init__.py
+++ b/lib/tagnews/__init__.py
@@ -1,4 +1,4 @@
 from . import utils
 from . import crimetype
 
-__version__ = '0.0.5'
+__version__ = '0.1.0'
diff --git a/lib/tagnews/crimetype/tag.py b/lib/tagnews/crimetype/tag.py
index 0224e6a..1c06d50 100644
--- a/lib/tagnews/crimetype/tag.py
+++ b/lib/tagnews/crimetype/tag.py
@@ -74,6 +74,23 @@ def tagtext(self, text, prob_thresh=0.5):
         return preds[preds > prob_thresh].index.values.tolist()
 
 
+    def relevant_proba(self, text):
+        """
+        Outputs the probability that the given text is relevant.
+        This probability is computed naively as the maximum of
+        the probabilities that each tag applies to the text.
+
+        A more nuanced method would compute a joint probability.
+
+        inputs:
+            text: A python string.
+
+        returns:
+            relevant_proba: Probability the text is relevant.
+        """
+        return max(self.tagtext_proba(text))
+
+
     def relevant(self, text, prob_thresh=0.05):
         """
         Determines whether given text is relevant or not. Relevance
diff --git a/lib/tagnews/tests/test_binary_stemmed.py b/lib/tagnews/tests/test_binary_stemmed.py
index 7e20279..5a45331 100644
--- a/lib/tagnews/tests/test_binary_stemmed.py
+++ b/lib/tagnews/tests/test_binary_stemmed.py
@@ -1,4 +1,3 @@
-import pytest
 import numpy as np
 
 import tagnews
diff --git a/lib/tagnews/tests/test_load_data.py b/lib/tagnews/tests/test_load_data.py
new file mode 100644
index 0000000..2677ac4
--- /dev/null
+++ b/lib/tagnews/tests/test_load_data.py
@@ -0,0 +1,22 @@
+import os
+
+import pytest
+import numpy as np
+
+from tagnews.utils import load_data
+
+_data_exists = os.path.isfile(os.path.join(load_data.__data_folder,
+                                           'newsarticles_article.csv'))
+
+@pytest.mark.filterwarnings('ignore')
+@pytest.mark.skipif(not _data_exists,
+                    reason='Data must be downloaded to load!')
+def test_load_data():
+    df = load_data.load_data()
+
+    # Just assert things about article with ID 12345
+    row = df.loc[12345, :]
+    assert row['CCCC'] == 1
+    assert row['CCJ'] == 0
+    assert (row['bodytext'].split('\n')[0]
+            == "![Larry E. Price (Sheriff's photo)][1]")
diff --git a/lib/tagnews/utils/load_data.py b/lib/tagnews/utils/load_data.py
index 6914382..7d077d2 100644
--- a/lib/tagnews/utils/load_data.py
+++ b/lib/tagnews/utils/load_data.py
@@ -85,12 +85,6 @@ def load_data(data_folder=__data_folder, nrows=None):
     categories_df = load_categories(data_folder)
     categories_df.set_index('id', drop=True, inplace=True)
 
-    for i in range(tags_df['category_id'].max()):
-        # cat_name = 'cat_' + str(i+1)
-        cat_name = categories_df.loc[i+1, 'abbreviation']
-        df[cat_name] = 0
-        df[cat_name] = df[cat_name].astype('int8') # save on that memory!
-
     # tags_df['category_id'] = tags_df['category_id'].astype(str)
     tags_df['category_abbreviation'] = (categories_df
                                         ['abbreviation']
@@ -102,8 +96,19 @@ def load_data(data_folder=__data_folder, nrows=None):
 
     article_ids = tags_df['article_id'].values
     cat_abbreviations = tags_df['category_abbreviation'].values
+
+    # For some reason, some articles that are tagged don't show up
+    # in the articles CSV. Filter those out.
     existing_ids_filter = np.isin(article_ids, df.index.values)
-    df.loc[article_ids[existing_ids_filter], cat_abbreviations[existing_ids_filter]] = 1
+
+    article_ids = article_ids[existing_ids_filter]
+    cat_abbreviations = cat_abbreviations[existing_ids_filter]
+
+    for i in range(tags_df['category_id'].max()):
+        cat_name = categories_df.loc[i+1, 'abbreviation']
+        df[cat_name] = 0
+        df[cat_name] = df[cat_name].astype('int8') # save on that memory!
+        df.loc[article_ids[cat_abbreviations == cat_name], cat_name] = 1
 
     return df
 
diff --git a/setup.py b/setup.py
index e777cc9..6d70012 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ def run(self):
             nltk.download(nltk_package)
 
 setup(name='tagnews',
-      version='0.0.5',
+      version='0.1.0',
       description='automatically tag articles with justice-related categories',
       author='Kevin Rose',
       url='https://github.com/chicago-justice-project/article-tagging',