
Commit

Refactor Cision pipeline
gilesdring committed Oct 6, 2023
1 parent 9fc8f0a commit d81aacf
Showing 13 changed files with 3,486 additions and 2,453 deletions.
4,272 changes: 2,153 additions & 2,119 deletions data/metrics/media_coverage/combined_cision.csv

Large diffs are not rendered by default.

17 changes: 8 additions & 9 deletions docs/metrics/media_coverage/_data/monthly_count.csv
@@ -23,13 +23,12 @@ month,International,Local,National,Regional,Unknown,Total
2022-10,1,1,0,9,17,28
2022-11,10,3,2,3,19,37
2022-12,29,9,12,8,16,74
2023-01,45,10,54,45,388,542
2023-02,2,19,0,11,97,129
2023-03,14,2,4,14,102,136
2023-04,2,0,1,0,94,97
2023-05,0,0,0,0,159,159
2023-06,1,0,8,0,105,114
2023-01,54,10,70,45,542,721
2023-02,2,19,0,11,96,128
2023-03,13,2,4,14,100,133
2023-04,0,0,0,0,92,92
2023-05,0,0,0,0,157,157
2023-06,0,0,0,0,136,136
2023-07,0,0,0,0,112,112
2023-08,0,0,0,0,66,66
2023-09,5,0,7,0,352,364
2023-10,0,0,0,0,43,43
2023-08,0,0,0,0,65,65
2023-09,0,0,0,0,251,251
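
For context, a month-by-category table like the one above can be produced with pd.crosstab. The commit does not include the code that generates monthly_count.csv, so this is a hypothetical sketch: the medium column name and the sample rows are assumptions.

import pandas as pd

# Hypothetical sketch: count coverage items per month and category,
# then add a row total, mirroring the CSV layout above.
df = pd.DataFrame({
    "news_date": pd.to_datetime(["2023-01-05", "2023-01-20", "2023-02-02"]),
    "medium": ["International", "Unknown", "Local"],
})
table = pd.crosstab(df["news_date"].dt.strftime("%Y-%m"), df["medium"])
table["Total"] = table.sum(axis=1)
print(table)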
10 changes: 5 additions & 5 deletions docs/metrics/media_coverage/_data/outlet_count.csv
@@ -1,6 +1,6 @@
outlet_name,count
BBC Radio Leeds,331
Yorkshire Evening Post,272
Yorkshire Evening Post - Yorkshireeveningpost.co.uk,189
West Leeds Dispatch,117
"Yorkshire Post, The",100
BBC Radio Leeds,335
Yorkshire Evening Post,274
Yorkshire Evening Post - Yorkshireeveningpost.co.uk,190
West Leeds Dispatch,119
"Yorkshire Post, The",102
10 changes: 5 additions & 5 deletions docs/metrics/media_coverage/_data/stats.yml
@@ -1,13 +1,13 @@
reach_max: 24886000
reach_max_outlet: Daily Telegraph - Telegraph.co.uk, The
total_audience_reach: 306501320
total_estimated_circulation: 2005899420
total_media: 2616
total_audience_reach: 307793541
total_estimated_circulation: 2032329143
total_media: 2649
total_media_international: 127
total_media_local: 457
total_media_national: 147
total_media_regional: 314
total_media_unknown: 1583
total_unique_views: 1699398100
total_media_unknown: 1616
total_unique_views: 1724535602
uv_max: 67151557
uv_max_outlet: MSN Arabia
73 changes: 22 additions & 51 deletions scripts/metrics/media/cision.py
@@ -1,5 +1,3 @@
import datetime
import glob
import logging
import os

@@ -8,50 +6,18 @@

TOP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))
LOG_DIR = os.path.join(TOP_DIR, 'working/log')
WORKING_DIR = os.path.join(TOP_DIR, 'working/manual/media')
OUTPUT_DIR = os.path.join(TOP_DIR, 'data/metrics/media_coverage')

LATEST_DATE = datetime.datetime.now()
HASH_KEY = "2023202320232023"

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
log_fh = logging.FileHandler(filename=os.path.join(
LOG_DIR, "media_cision.log"), mode="w")
log_formatter = logging.Formatter('%(levelname)s:%(funcName)s:%(message)s')
log_fh.setFormatter(log_formatter)
logger.addHandler(log_fh)


def load_cision_files():
files = list_cision_files()

dfs = pd.concat([load_cision_file(file) for file in files])
print(dfs)
return dfs.pipe(clean_up)


def list_cision_files():
'''
List the available CSV files
'''
return glob.glob(os.path.join(WORKING_DIR, '*.csv'))


def load_cision_file(filepath):
logger.info('Loading %s', filepath)
return (
pd.read_csv(filepath, encoding=guess_encoding(filepath), thousands=',')
.pipe(append_filename, os.path.basename(filepath))
.pipe(normalise_column_names)
.pipe(patch_column_names)
.pipe(drop_empty_headlines)
.pipe(guess_date)
.pipe(convert_numbers)
.pipe(add_hash)
)


def guess_encoding(file):
with open(file, 'rb') as f:
result = chardet.detect(f.read())
@@ -94,12 +60,16 @@ def drop_empty_headlines(data):
return data[~data['news_headline'].isna()]


def guess_date(data):
def guess_date(data, latest_date):
if not latest_date:
latest_date = pd.Timestamp.now()

known_formats = [
"%d/%m/%Y",
"%m/%d/%Y",
"%d/%m/%y",
"%d.%m.%y",
"%d.%m.%Y",
"%d-%b"
]

@@ -111,19 +81,23 @@ def guess_date(data):
)

# Get rid of any dates in the future
dates[dates.news_date > pd.Timestamp.now()] = pd.NaT

dates[dates.news_date > latest_date] = pd.NaT
# Backfill and take the first column
dates = dates.bfill(axis=1).iloc[:, 0]

# Handle any dates far into the past
dates = dates.mask(dates.dt.year < 2022, dates.apply(lambda d: d + pd.offsets.DateOffset(year=2023)))
dates = dates.mask(dates.dt.year < 2022, dates.apply(
lambda d: d + pd.offsets.DateOffset(year=2023)))

if len(dates[dates.isna()]) > 1:
logger.error('Some incompatible date formats found')
return pd.DataFrame(columns=data.columns)

data['input_date'] = data.news_date
data['news_date'] = dates

if len(dates[dates.isna()]) > 1:
logger.warning('Some incompatible date formats found (latest expected date %s)', latest_date)
logger.debug('Missing dates %r', data.loc[data.news_date.isna(), ['input_date', 'news_headline', 'source_file']])

return data
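
# --- Illustrative aside (not part of this commit) --------------------
# A minimal standalone sketch of the fallback-parsing pattern guess_date
# uses above: each candidate format is parsed into its own column, and
# bfill(axis=1) keeps the first successful parse per row. The sample
# values are assumptions.
import pandas as pd

raw = pd.Series(["06/10/2023", "10.06.23", "6-Oct"])
formats = ["%d/%m/%Y", "%m/%d/%Y", "%d/%m/%y", "%d.%m.%y", "%d.%m.%Y", "%d-%b"]
parsed = pd.concat(
    [pd.to_datetime(raw, format=f, errors="coerce") for f in formats],
    axis=1,
)
best = parsed.bfill(axis=1).iloc[:, 0]
# "6-Oct" parses with a default year of 1900, which is why the function
# shifts pre-2022 years forward with pd.offsets.DateOffset(year=2023).
print(best)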


@@ -135,7 +109,7 @@ def convert_numbers(data):
logger.error('Source file -> %s', data.source_file[0])
logger.error('Columns %s', data.columns)
except Exception as e:
logger.error(e)
logger.warning(e)
data['uv'] = pd.to_numeric(
data['uv'].str.strip().str.replace(',', ''), errors="coerce"
).astype('Int64')
@@ -145,7 +119,7 @@ def convert_numbers(data):
data['audience_reach'] = data['audience_reach'].fillna(
0).astype('Int64')
except Exception as e:
logger.error(e)
logger.warning(e)
data.audience_reach = pd.to_numeric(
data.audience_reach.str.strip().str.replace(',', ''), errors="coerce"
).astype('Int64')
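
# --- Illustrative aside (not part of this commit) --------------------
# The coercion pattern convert_numbers applies to uv and audience_reach:
# strip whitespace, drop thousands separators, coerce failures to <NA>,
# and store as nullable Int64. The sample values are assumptions.
import pandas as pd

raw = pd.Series([" 1,234 ", "56789", "n/a", None])
numbers = pd.to_numeric(
    raw.str.strip().str.replace(",", ""), errors="coerce"
).astype("Int64")
print(numbers)  # 1234, 56789, <NA>, <NA>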
@@ -156,11 +130,10 @@
def add_hash(data):
# Add identifier
data['hash'] = pd.util.hash_pandas_object(
data[['news_date', 'news_headline', 'outlet_name']], hash_key=HASH_KEY, index=False)
data, hash_key=HASH_KEY, index=False)
return data
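
# --- Illustrative aside (not part of this commit) --------------------
# pd.util.hash_pandas_object returns one uint64 per row; hash_key must
# be exactly 16 bytes. Hashing the whole frame, as add_hash now does,
# makes the identifier sensitive to every column rather than only the
# previous date/headline/outlet subset. The sample frame is an
# assumption.
import pandas as pd

sample = pd.DataFrame({"news_headline": ["Story A"], "outlet_name": ["Outlet"]})
sample["hash"] = pd.util.hash_pandas_object(
    sample, hash_key="2023202320232023", index=False)
print(sample)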



def clean_up(data):
data = data.drop(
columns=['news_text', 'contact_name', 'news_attachment_name'], errors='ignore'
@@ -170,17 +143,15 @@ def clean_up(data):
return data


def save_csv(data, output_file):
def save_csv(data: pd.DataFrame, output_file):
# Set columns order
columns_order = ['news_date', 'news_headline', 'outlet_name', 'audience_reach',
'uv', 'tone', 'medium', 'outlet_type', 'custom_tags', 'news_company_mentions',
'hash',
'source_file'
'source_file', 'latest_date', 'input_date'
]
data = data.reindex(columns=columns_order)

data.sort_values(by=['news_date', 'news_headline', 'outlet_name', 'medium']).to_csv(
data.loc[:, columns_order].sort_values(by=['latest_date', 'news_date', 'news_headline', 'outlet_name', 'medium']).to_csv(
output_file, index=False)
return data
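
For context, a minimal sketch of calling the reworked save_csv, assuming the function above is in scope: reindex(columns=...) fixes the column order and fills any missing column with NA, so rows sort stably even when an input file lacked some fields. The sample frame and output path are assumptions, not part of the commit.

import pandas as pd

data = pd.DataFrame({
    "news_date": pd.to_datetime(["2023-10-06", "2023-10-05"]),
    "news_headline": ["B story", "A story"],
    "outlet_name": ["Outlet", "Outlet"],
    "latest_date": pd.Timestamp("2023-10-06"),
    "medium": ["Online", "Online"],
})
# Columns not supplied (uv, tone, hash, ...) are written as empty fields.
save_csv(data, "combined_cision.csv")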


