Clean up metadata processing
gilesdring committed Oct 16, 2023
1 parent 152cfa5 commit 365aa52
Showing 14 changed files with 239 additions and 226 deletions.
8 changes: 6 additions & 2 deletions scripts/cpi/dvc.lock
@@ -48,8 +48,12 @@ stages:
       size: 129729
     - path: scripts/cpi/prepare.py
       hash: md5
-      md5: eae0eadf2582f5c6398273d08614a390
-      size: 8631
+      md5: baa430f643d94e5596d409b15e943900
+      size: 8605
+    - path: working/upstream/metadata.csv
+      hash: md5
+      md5: e8f918d67dad983a618a80d2185527dc
+      size: 134
     - path: working/upstream/mm23-codes.csv
       hash: md5
       md5: 83bd378b2ba90ee84ca2546ff6f51dc9
1 change: 1 addition & 0 deletions scripts/cpi/dvc.yaml
@@ -17,6 +17,7 @@ stages:
       - scripts/cpi/prepare.py
       - data/cpi/transformed_cpi.csv
       - working/upstream/mm23-codes.csv
+      - working/upstream/metadata.csv
     outs:
       - data/cpi/indicator.csv:
           cache: false
8 changes: 4 additions & 4 deletions scripts/cpi/prepare.py
@@ -4,17 +4,17 @@
 from transform import DATA_DIR as INPUTS_DIR
 from transform import n
 from scripts.util.util import iso_to_named_date
-from scripts.util.metadata import read_meta, filter_for_dataset
+from scripts.util.metadata import read_meta, extract_dates

 OUTPUTS_DIR = os.path.realpath(os.path.join('src', '_data', 'sources', 'cpi'))
 CPI_METADATA = os.path.join(OUTPUTS_DIR, 'metadata.json')


 def get_dates():
     # read csv and make a new dataframe
-    metadata = read_meta().pipe(filter_for_dataset, 'MM23')
-    next_update = iso_to_named_date(metadata['next_update'].iloc[0])
-    published = iso_to_named_date(metadata['last_update'].iloc[0])
+    metadata = read_meta().pipe(extract_dates, 'MM23')
+    next_update = iso_to_named_date(metadata['next_update'])
+    published = iso_to_named_date(metadata['last_update'])
     dates = pd.Series(
         data={
             'published': published,
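For reference, pandas' `DataFrame.pipe(func, *args)` simply calls `func(df, *args)`, so the new one-liner above is equivalent to `extract_dates(read_meta(), 'MM23')`. Because the updated `extract_dates` returns a Series rather than a one-row frame, the `.iloc[0]` lookups go away. A minimal sketch of the new call pattern, with `extract_dates` copied from scripts/util/metadata.py and made-up dates (the real values live in working/upstream/metadata.csv):

```python
# Sketch only: extract_dates matches scripts/util/metadata.py;
# the index and dates below are illustrative sample values.
import pandas as pd

def extract_dates(metadata, id):
    return metadata.loc[id, ['last_update', 'next_update']]

metadata = pd.DataFrame(
    {'last_update': pd.to_datetime(['2023-09-20']),
     'next_update': pd.to_datetime(['2023-10-18'])},
    index=pd.Index(['MM23'], name='id'),
)

# df.pipe(f, x) is shorthand for f(df, x), so these are identical:
dates = metadata.pipe(extract_dates, 'MM23')
assert dates.equals(extract_dates(metadata, 'MM23'))

# .loc[id, [...]] on a single row yields a Series, so values are
# reached by label, with no .iloc[0] needed:
print(dates['next_update'])
```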
12 changes: 8 additions & 4 deletions scripts/labour-market/dvc.lock
@@ -29,17 +29,21 @@ stages:
       size: 359985
     - path: ../../scripts/util/
       hash: md5
-      md5: c7bb69be54b1e9f9ce79d49e29ead857.dir
-      size: 9655
+      md5: 3727a82e6b3317df98198097a7646dc0.dir
+      size: 9441
       nfiles: 6
+    - path: ../../working/upstream/metadata.csv
+      hash: md5
+      md5: e8f918d67dad983a618a80d2185527dc
+      size: 134
     - path: config.py
       hash: md5
       md5: c9187b99fb9c48a24be707561d15eaa2
       size: 5131
     - path: prepare.ipynb
       hash: md5
-      md5: 8e26694a5328022c4ef105c1eb274952
-      size: 55354
+      md5: a169583bbac0a6b21b631a4f25058bf1
+      size: 55309
     - path: prepare.py
       hash: md5
       md5: 09def7c536dd2fc8644c8100908aaebc
1 change: 1 addition & 0 deletions scripts/labour-market/dvc.yaml
@@ -22,6 +22,7 @@ stages:
       - config.py
       - ${top}/scripts/util/
       - ${top}/data/labour-market/monthly-rolling.csv
+      - ${top}/working/upstream/metadata.csv
     outs:
       - ${top}/src/_data/sources/labour-market/long_term_unemployed_last_3_years.csv:
           cache: false
4 changes: 2 additions & 2 deletions scripts/labour-market/prepare.ipynb
@@ -21,7 +21,7 @@
 "outputs": [],
 "source": [
 "from prepare import LMS_EXTRACT, DASHBOARD_DIR, create_table, save_files, summarise, labour_market_status_variables, long_term_unemployed_variables\n",
-"from scripts.util.metadata import read_meta, filter_for_dataset, extract_dates"
+"from scripts.util.metadata import read_meta, extract_dates"
 ]
 },
 {
@@ -1166,7 +1166,7 @@
 " long_term_unemployed=long_term_unemployed,\n",
 " labour_market_status=labour_market_status,\n",
 " ),\n",
-" read_meta().pipe(filter_for_dataset, 'LMS').pipe(extract_dates),\n",
+" read_meta().pipe(extract_dates, 'LMS'),\n",
 "])\n",
 "\n",
 "summary.to_json(os.path.join(DASHBOARD_DIR, 'latest.json'), indent=2, date_format='iso')"
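The second hunk above is only partially shown (its opening lines sit behind the collapsed region), but the visible closing bracket and `summary.to_json(...)` suggest the dates Series from `extract_dates` is concatenated into the dashboard summary before being written out. A hedged reconstruction of that shape, with invented values and a literal Series standing in for `read_meta().pipe(extract_dates, 'LMS')`:

```python
# Hypothetical reconstruction: the cell's opening lines are not
# visible in this diff, and all values below are invented.
import pandas as pd

long_term_unemployed = 4200          # invented figure
labour_market_status = 'improving'   # invented label

summary = pd.concat([
    pd.Series(dict(
        long_term_unemployed=long_term_unemployed,
        labour_market_status=labour_market_status,
    )),
    # Stand-in for read_meta().pipe(extract_dates, 'LMS'):
    pd.Series({
        'last_update': pd.Timestamp('2023-09-12'),
        'next_update': pd.Timestamp('2023-10-17'),
    }),
])

# The notebook writes to os.path.join(DASHBOARD_DIR, 'latest.json').
summary.to_json('latest.json', indent=2, date_format='iso')
```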
4 changes: 4 additions & 0 deletions scripts/neet/dvc.lock
@@ -14,6 +14,10 @@ stages:
   transform:
     cmd: papermill transform.ipynb ../../working/output/neet-transform.ipynb
     deps:
+    - path: ../../working/upstream/metadata.csv
+      hash: md5
+      md5: e8f918d67dad983a618a80d2185527dc
+      size: 134
     - path: ../../working/upstream/neet.csv
       hash: md5
       md5: a6d894c2cdc326e9b9d803c8149457c8
1 change: 1 addition & 0 deletions scripts/neet/dvc.yaml
@@ -6,6 +6,7 @@ stages:
     deps:
       - transform.ipynb
       - ${top}/working/upstream/neet.csv
+      - ${top}/working/upstream/metadata.csv
     outs:
       - ${top}/data/neet/neet.csv:
           cache: false
13 changes: 3 additions & 10 deletions scripts/util/metadata.py
@@ -7,19 +7,12 @@

 def read_meta():
     # read csv and make a new dataframe
-    metadata = pd.read_csv(METADATA_FILE)
-    metadata.last_update = pd.to_datetime(metadata.last_update, format='ISO8601')
-    metadata.next_update = pd.to_datetime(metadata.next_update, format='ISO8601')
+    metadata = pd.read_csv(METADATA_FILE, parse_dates=['last_update', 'next_update'], index_col=['id'])
     return metadata


-def filter_for_dataset(metadata, id):
-    metadata = metadata[metadata.id == id].reset_index()
-    return metadata
-
-
-def extract_dates(metadata):
-    return metadata.loc[0, ['last_update', 'next_update']]
+def extract_dates(metadata, id):
+    return metadata.loc[id, ['last_update', 'next_update']]
 # next_update = metadata['next_update'].iloc[0]
 # published = metadata['last_update'].iloc[0]
 # dates = pd.Series(data={'published': published, 'next_update': next_update}, index=['published', 'next_update'])
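The net effect of this file's refactor: one `read_csv` call parses both date columns and indexes the frame by dataset id, which turns `extract_dates` into a plain label lookup and makes `filter_for_dataset` redundant. A runnable sketch against an in-memory stand-in for working/upstream/metadata.csv (the id/last_update/next_update layout is inferred from the code; the rows are invented):

```python
# Stand-in for working/upstream/metadata.csv; the column layout is
# taken from the code above, the rows are example values.
import io
import pandas as pd

CSV = io.StringIO(
    "id,last_update,next_update\n"
    "MM23,2023-09-20,2023-10-18\n"
    "LMS,2023-09-12,2023-10-17\n"
)

# New read_meta: parse_dates + index_col replace the two explicit
# pd.to_datetime assignments and the filter_for_dataset helper.
metadata = pd.read_csv(CSV, parse_dates=['last_update', 'next_update'],
                       index_col=['id'])

# New extract_dates: a label lookup returns the two dates for one
# dataset as a Series.
dates = metadata.loc['LMS', ['last_update', 'next_update']]
print(dates['last_update'].date(), dates['next_update'].date())
```

One behavioural note: `metadata.loc[id, ...]` raises a `KeyError` when the id is absent, so a missing dataset now fails at the lookup itself.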
12 changes: 8 additions & 4 deletions scripts/vacancies/dvc.lock
@@ -9,12 +9,16 @@ stages:
       size: 10506
     - path: scripts/util/metadata.py
       hash: md5
-      md5: 4c10007d68898289db269e5da494a0f5
-      size: 911
+      md5: cfc702d76a3920af615dbbf1a9408cfd
+      size: 697
     - path: scripts/vacancies/prepare.py
       hash: md5
-      md5: 714220570380a26716ec2053a008c262
-      size: 5849
+      md5: 3cf43cb424b9c3af8c5b362f36b36325
+      size: 5823
+    - path: working/upstream/metadata.csv
+      hash: md5
+      md5: e8f918d67dad983a618a80d2185527dc
+      size: 134
     - path: working/upstream/vacancies-growth-by-sector.csv
       hash: md5
       md5: 8696accdcce4286973199b89da179271
1 change: 1 addition & 0 deletions scripts/vacancies/dvc.yaml
@@ -18,6 +18,7 @@ stages:
       - scripts/util/metadata.py
       - data/vacancies/vacancies_by_date.csv
       - working/upstream/vacancies-growth-by-sector.csv
+      - working/upstream/metadata.csv
     outs:
       - src/_data/sources/vacancies/:
           cache: false
8 changes: 4 additions & 4 deletions scripts/vacancies/prepare.py
@@ -3,7 +3,7 @@

 from scripts.util.date import lms_period_to_quarter_label
 from scripts.util.util import iso_to_named_date
-from scripts.util.metadata import read_meta, filter_for_dataset
+from scripts.util.metadata import read_meta, extract_dates

 DATA_DIR = os.path.join('src', '_data', 'sources', 'vacancies')
 RAW_DATA_DIR = os.path.realpath(os.path.join('data', 'vacancies'))
@@ -132,9 +132,9 @@ def summarise():

 def get_dates():
     # read csv and make a new dataframe
-    metadata = read_meta().pipe(filter_for_dataset, 'LMS')
-    next_update = iso_to_named_date(metadata['next_update'].iloc[0])
-    published = iso_to_named_date(metadata['last_update'].iloc[0])
+    metadata = read_meta().pipe(extract_dates, 'LMS')
+    next_update = iso_to_named_date(metadata['next_update'])
+    published = iso_to_named_date(metadata['last_update'])
     dates = pd.Series(data={'published': published, 'next_update': next_update}, index=[
         'published', 'next_update'])
     dates.to_json(os.path.join(DATA_DIR, 'metadata.json'), date_format='iso')