
Commit

Refactor Cision pipeline
gilesdring committed Oct 6, 2023
1 parent 9fc8f0a commit d81aacf
Showing 13 changed files with 3,486 additions and 2,453 deletions.
4,272 changes: 2,153 additions & 2,119 deletions data/metrics/media_coverage/combined_cision.csv

Large diffs are not rendered by default.

17 changes: 8 additions & 9 deletions docs/metrics/media_coverage/_data/monthly_count.csv
@@ -23,13 +23,12 @@ month,International,Local,National,Regional,Unknown,Total
2022-10,1,1,0,9,17,28
2022-11,10,3,2,3,19,37
2022-12,29,9,12,8,16,74
2023-01,45,10,54,45,388,542
2023-02,2,19,0,11,97,129
2023-03,14,2,4,14,102,136
2023-04,2,0,1,0,94,97
2023-05,0,0,0,0,159,159
2023-06,1,0,8,0,105,114
2023-01,54,10,70,45,542,721
2023-02,2,19,0,11,96,128
2023-03,13,2,4,14,100,133
2023-04,0,0,0,0,92,92
2023-05,0,0,0,0,157,157
2023-06,0,0,0,0,136,136
2023-07,0,0,0,0,112,112
2023-08,0,0,0,0,66,66
2023-09,5,0,7,0,352,364
2023-10,0,0,0,0,43,43
2023-08,0,0,0,0,65,65
2023-09,0,0,0,0,251,251
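
For context, a month-by-category table like the one above can be produced with pd.crosstab. The commit does not include the code that generates monthly_count.csv, so this is a hypothetical sketch: the medium column name and the sample rows are assumptions.

import pandas as pd

# Hypothetical sketch: count coverage items per month and category,
# then add a row total, mirroring the CSV layout above.
df = pd.DataFrame({
    "news_date": pd.to_datetime(["2023-01-05", "2023-01-20", "2023-02-02"]),
    "medium": ["International", "Unknown", "Local"],
})
table = pd.crosstab(df["news_date"].dt.strftime("%Y-%m"), df["medium"])
table["Total"] = table.sum(axis=1)
print(table)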
10 changes: 5 additions & 5 deletions docs/metrics/media_coverage/_data/outlet_count.csv
@@ -1,6 +1,6 @@
outlet_name,count
BBC Radio Leeds,331
Yorkshire Evening Post,272
Yorkshire Evening Post - Yorkshireeveningpost.co.uk,189
West Leeds Dispatch,117
"Yorkshire Post, The",100
BBC Radio Leeds,335
Yorkshire Evening Post,274
Yorkshire Evening Post - Yorkshireeveningpost.co.uk,190
West Leeds Dispatch,119
"Yorkshire Post, The",102
10 changes: 5 additions & 5 deletions docs/metrics/media_coverage/_data/stats.yml
@@ -1,13 +1,13 @@
reach_max: 24886000
reach_max_outlet: Daily Telegraph - Telegraph.co.uk, The
total_audience_reach: 306501320
total_estimated_circulation: 2005899420
total_media: 2616
total_audience_reach: 307793541
total_estimated_circulation: 2032329143
total_media: 2649
total_media_international: 127
total_media_local: 457
total_media_national: 147
total_media_regional: 314
total_media_unknown: 1583
total_unique_views: 1699398100
total_media_unknown: 1616
total_unique_views: 1724535602
uv_max: 67151557
uv_max_outlet: MSN Arabia
73 changes: 22 additions & 51 deletions scripts/metrics/media/cision.py
@@ -1,5 +1,3 @@
import datetime
import glob
import logging
import os

@@ -8,50 +6,18 @@

TOP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))
LOG_DIR = os.path.join(TOP_DIR, 'working/log')
WORKING_DIR = os.path.join(TOP_DIR, 'working/manual/media')
OUTPUT_DIR = os.path.join(TOP_DIR, 'data/metrics/media_coverage')

LATEST_DATE = datetime.datetime.now()
HASH_KEY = "2023202320232023"

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
log_fh = logging.FileHandler(filename=os.path.join(
LOG_DIR, "media_cision.log"), mode="w")
log_formatter = logging.Formatter('%(levelname)s:%(funcName)s:%(message)s')
log_fh.setFormatter(log_formatter)
logger.addHandler(log_fh)


def load_cision_files():
files = list_cision_files()

dfs = pd.concat([load_cision_file(file) for file in files])
print(dfs)
return dfs.pipe(clean_up)


def list_cision_files():
'''
List the available CSV files
'''
return glob.glob(os.path.join(WORKING_DIR, '*.csv'))


def load_cision_file(filepath):
logger.info('Loading %s', filepath)
return (
pd.read_csv(filepath, encoding=guess_encoding(filepath), thousands=',')
.pipe(append_filename, os.path.basename(filepath))
.pipe(normalise_column_names)
.pipe(patch_column_names)
.pipe(drop_empty_headlines)
.pipe(guess_date)
.pipe(convert_numbers)
.pipe(add_hash)
)


def guess_encoding(file):
with open(file, 'rb') as f:
result = chardet.detect(f.read())
@@ -94,12 +60,16 @@ def drop_empty_headlines(data):
return data[~data['news_headline'].isna()]


def guess_date(data):
def guess_date(data, latest_date):
if not latest_date:
latest_date = pd.Timestamp.now()

known_formats = [
"%d/%m/%Y",
"%m/%d/%Y",
"%d/%m/%y",
"%d.%m.%y",
"%d.%m.%Y",
"%d-%b"
]

@@ -111,19 +81,23 @@ def guess_date(data):
)

# Get rid of any dates in the future
dates[dates.news_date > pd.Timestamp.now()] = pd.NaT

dates[dates.news_date > latest_date] = pd.NaT
# Backfill and take the first column
dates = dates.bfill(axis=1).iloc[:, 0]

# Handle any dates far into the past
dates = dates.mask(dates.dt.year < 2022, dates.apply(lambda d: d + pd.offsets.DateOffset(year=2023)))
dates = dates.mask(dates.dt.year < 2022, dates.apply(
lambda d: d + pd.offsets.DateOffset(year=2023)))

if len(dates[dates.isna()]) > 1:
logger.error('Some incompatible date formats found')
return pd.DataFrame(columns=data.columns)

data['input_date'] = data.news_date
data['news_date'] = dates

if len(dates[dates.isna()]) > 1:
logger.warning('Some incompatible date formats found (latest expected date %s)', latest_date)
logger.debug('Missing dates %r', data.loc[data.news_date.isna(), ['input_date', 'news_headline', 'source_file']])

return data
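
# --- Illustrative aside (not part of this commit) --------------------
# A minimal standalone sketch of the fallback-parsing pattern guess_date
# uses above: each candidate format is parsed into its own column, and
# bfill(axis=1) keeps the first successful parse per row. The sample
# values are assumptions.
import pandas as pd

raw = pd.Series(["06/10/2023", "10.06.23", "6-Oct"])
formats = ["%d/%m/%Y", "%m/%d/%Y", "%d/%m/%y", "%d.%m.%y", "%d.%m.%Y", "%d-%b"]
parsed = pd.concat(
    [pd.to_datetime(raw, format=f, errors="coerce") for f in formats],
    axis=1,
)
best = parsed.bfill(axis=1).iloc[:, 0]
# "6-Oct" parses with a default year of 1900, which is why the function
# shifts pre-2022 years forward with pd.offsets.DateOffset(year=2023).
print(best)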


@@ -135,7 +109,7 @@ def convert_numbers(data):
logger.error('Source file -> %s', data.source_file[0])
logger.error('Columns %s', data.columns)
except Exception as e:
logger.error(e)
logger.warning(e)
data['uv'] = pd.to_numeric(
data['uv'].str.strip().str.replace(',', ''), errors="coerce"
).astype('Int64')
@@ -145,7 +119,7 @@ def convert_numbers(data):
data['audience_reach'] = data['audience_reach'].fillna(
0).astype('Int64')
except Exception as e:
logger.error(e)
logger.warning(e)
data.audience_reach = pd.to_numeric(
data.audience_reach.str.strip().str.replace(',', ''), errors="coerce"
).astype('Int64')
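
# --- Illustrative aside (not part of this commit) --------------------
# The coercion pattern convert_numbers applies to uv and audience_reach:
# strip whitespace, drop thousands separators, coerce failures to <NA>,
# and store as nullable Int64. The sample values are assumptions.
import pandas as pd

raw = pd.Series([" 1,234 ", "56789", "n/a", None])
numbers = pd.to_numeric(
    raw.str.strip().str.replace(",", ""), errors="coerce"
).astype("Int64")
print(numbers)  # 1234, 56789, <NA>, <NA>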
@@ -156,11 +130,10 @@
def add_hash(data):
# Add identifier
data['hash'] = pd.util.hash_pandas_object(
data[['news_date', 'news_headline', 'outlet_name']], hash_key=HASH_KEY, index=False)
data, hash_key=HASH_KEY, index=False)
return data
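
# --- Illustrative aside (not part of this commit) --------------------
# pd.util.hash_pandas_object returns one uint64 per row; hash_key must
# be exactly 16 bytes. Hashing the whole frame, as add_hash now does,
# makes the identifier sensitive to every column rather than only the
# previous date/headline/outlet subset. The sample frame is an
# assumption.
import pandas as pd

sample = pd.DataFrame({"news_headline": ["Story A"], "outlet_name": ["Outlet"]})
sample["hash"] = pd.util.hash_pandas_object(
    sample, hash_key="2023202320232023", index=False)
print(sample)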



def clean_up(data):
data = data.drop(
columns=['news_text', 'contact_name', 'news_attachment_name'], errors='ignore'
@@ -170,17 +143,15 @@ def clean_up(data):
return data


def save_csv(data, output_file):
def save_csv(data: pd.DataFrame, output_file):
# Set columns order
columns_order = ['news_date', 'news_headline', 'outlet_name', 'audience_reach',
'uv', 'tone', 'medium', 'outlet_type', 'custom_tags', 'news_company_mentions',
'hash',
'source_file'
'source_file', 'latest_date', 'input_date'
]
data = data.reindex(columns=columns_order)

data.sort_values(by=['news_date', 'news_headline', 'outlet_name', 'medium']).to_csv(
data.loc[:, columns_order].sort_values(by=['latest_date', 'news_date', 'news_headline', 'outlet_name', 'medium']).to_csv(
output_file, index=False)
return data
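
For context, a minimal sketch of calling the reworked save_csv, assuming the function above is in scope: reindex(columns=...) fixes the column order and fills any missing column with NA, so rows sort stably even when an input file lacked some fields. The sample frame and output path are assumptions, not part of the commit.

import pandas as pd

data = pd.DataFrame({
    "news_date": pd.to_datetime(["2023-10-06", "2023-10-05"]),
    "news_headline": ["B story", "A story"],
    "outlet_name": ["Outlet", "Outlet"],
    "latest_date": pd.Timestamp("2023-10-06"),
    "medium": ["Online", "Online"],
})
# Columns not supplied (uv, tone, hash, ...) are written as empty fields.
save_csv(data, "combined_cision.csv")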


