Skip to content

Commit

Permalink
Merge pull request #12 from odissei-data/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
FjodorvRijsselberg authored Jul 31, 2023
2 parents 8568414 + 293729b commit cd05ed5
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 21 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "metadata-refiner"
version = "1.1.0"
version = "1.2.0"
description = "A service that refines metadata."
authors = ["Thomas van Erven"]

Expand Down
74 changes: 61 additions & 13 deletions src/refiners/cbs_refiner.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,47 @@
import re

from fastapi import HTTPException


def refine_cbs_metadata(metadata: dict, dsc_dictionary) -> dict:
    """ Refines the alt title, keyword and statline fields of CBS metadata.

    The alternative title is either matched on a table or cleaned up.
    The keyword is split on "/" character and added as separate keywords.
    The statline field formats all statline links to be clickable URLs.

    The citation and CBSMetadata blocks are each optional: a block that is
    absent is skipped. The field dicts handed back by get_field are updated
    and the same metadata object that was passed in is returned.

    :param metadata: CBS metadata to be refined.
    :param dsc_dictionary: DSC dictionary containing refined alt titles.
    :return: Refined metadata.
    :raises HTTPException: Raises (422) if required keys are missing from
        metadata, including the 'fields' key of a block that is present.
    """
    try:
        metadata_blocks = metadata['datasetVersion']['metadataBlocks']
        # Resolve 'fields' inside the try so that a present block without
        # a 'fields' key is reported as a 422 instead of a bare KeyError.
        citation_fields = (metadata_blocks['citation']['fields']
                           if 'citation' in metadata_blocks else None)
        cbs_fields = (metadata_blocks['CBSMetadata']['fields']
                      if 'CBSMetadata' in metadata_blocks else None)
    except KeyError as error:
        raise HTTPException(status_code=422, detail=str(error))

    # refinements for fields in the citation metadata block.
    if citation_fields is not None:
        alt_title_dict = get_field('alternativeTitle', citation_fields)
        if 'value' in alt_title_dict:
            alt_title_dict['value'] = refine_alternative_title(
                alt_title_dict['value'], dsc_dictionary)

        keyword_dict = get_field('keyword', citation_fields)
        if 'value' in keyword_dict:
            keyword_dict['value'] = refine_keywords(
                keyword_dict['value'])

    # refinements for fields in the CBSMetadata block.
    if cbs_fields is not None:
        statline_dict = get_field('statlineTabel', cbs_fields)
        if 'value' in statline_dict:
            statline_dict['value'] = refine_statline_table(
                statline_dict['value'])

    return metadata


Expand Down Expand Up @@ -82,10 +96,11 @@ def Add_split_keywords(keyword):

def refine_alternative_title(alt_title, dsc_dictionary):
"""
Refine an alternative title by looking it up in a dictionary or cleaning it if not found.
Refine an alternative title by looking it up in a dictionary or
cleaning it if not found.
:param alt_title: The alternative title to refine.
:param dsc_dictionary: The dictionary containing refined alternative titles.
:param dsc_dictionary: Dictionary containing refined alternative titles.
:return: The refined alternative title.
"""
try:
Expand Down Expand Up @@ -140,3 +155,36 @@ def clean_alternative_title(alternative_title: str):
alternative_title = alternative_title.rstrip('_')

return alternative_title


def refine_statline_table(statlineLinks: list) -> list:
    """ Turns every statline reference in the list into a clickable URL.

    Entries that are already URLs are kept unchanged. The other entries are
    treated as statline table ids and expanded with format_statline_url.
    The entries that were already URLs come first in the result, followed
    by the newly formatted ones; each group keeps its relative order.

    :param statlineLinks: A list of URLs and table id's linking to statline.
    :return: List with URLs linking to statline tables.
    """
    kept_urls = []
    built_urls = []
    for link in statlineLinks:
        if is_url(link):
            kept_urls.append(link)
        else:
            built_urls.append(format_statline_url(link))

    return kept_urls + built_urls


def format_statline_url(statline_code):
    """ Builds the clickable statline URL for a table code.

    :param statline_code: The code identifying a statline table.
    :return: The URL that links to the statline table.
    """
    url = f'https://opendata.cbs.nl/#/CBS/nl/dataset/{statline_code}'
    return url


def is_url(s):
    """ Tells whether a string looks like an http(s) URL.

    :param s: The string to check.
    :return: True if s starts with http:// or https:// and continues with
        one or more non-whitespace characters (the pattern tolerates a
        single trailing newline), False otherwise.
    """
    return re.match(r'^https?://\S+$', s) is not None
106 changes: 99 additions & 7 deletions src/tests/test_cbs_refiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def test_remove_jjjj_vv():


def test_cbs_metadata_refiner_dsc_dictionary(dsc_dict):
print(dsc_dict)
input_data = {
"datasetVersion": {
"metadataBlocks": {
Expand Down Expand Up @@ -196,9 +195,6 @@ def test_cbs_metadata_refiner_refine_keywords(dsc_dict):
assert test_output == expected_output


assert test_output == expected_output


def test_refine_keywords():
fields = [
{'typeName': 'keyword', 'value': [
Expand All @@ -210,8 +206,10 @@ def test_refine_keywords():

# Test case 1: Keywords with slashes
expected_output = [
{'keywordValue': {'typeName': 'keywordValue', 'multiple': False, 'typeClass': 'primitive', 'value': 'keyword1'}},
{'keywordValue': {'typeName': 'keywordValue', 'multiple': False, 'typeClass': 'primitive', 'value': 'keyword2'}}
{'keywordValue': {'typeName': 'keywordValue', 'multiple': False,
'typeClass': 'primitive', 'value': 'keyword1'}},
{'keywordValue': {'typeName': 'keywordValue', 'multiple': False,
'typeClass': 'primitive', 'value': 'keyword2'}}
]
output = refine_keywords(fields[0]['value'])
assert output == expected_output
Expand All @@ -225,7 +223,8 @@ def test_refine_keywords():
{'typeName': 'other', 'value': 'some value'}
]
expected_output_no_keyword_value = []
output_no_keyword_value = refine_keywords(fields_no_keyword_value[0]['value'])
output_no_keyword_value = refine_keywords(
fields_no_keyword_value[0]['value'])
assert output_no_keyword_value == expected_output_no_keyword_value

# Test case 3: Empty keyword value
Expand All @@ -238,3 +237,96 @@ def test_refine_keywords():
assert output_empty_keywords == expected_output_empty_keywords


def test_refine_statline(dsc_dict):
    """ Statline values that are already URLs are returned unchanged.

    The dsc_dict fixture is requested as a parameter so that the fixture
    value is passed on, as in the other refine_cbs_metadata tests.
    """
    input_data = {
        "datasetVersion": {
            "metadataBlocks": {
                "CBSMetadata": {
                    "fields": [
                        {
                            "typeName": "statlineTabel",
                            "typeClass": "primitive",
                            "multiple": True,
                            "value": [
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70810ned",
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70077ned",
                                "https://opendata.cbs.nl/#/CBS/en/dataset/70077eng",
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70004ned"
                            ]
                        }
                    ]
                }
            }
        }
    }

    expected_output = {
        "datasetVersion": {
            "metadataBlocks": {
                "CBSMetadata": {
                    "fields": [
                        {
                            "typeName": "statlineTabel",
                            "typeClass": "primitive",
                            "multiple": True,
                            "value": [
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70810ned",
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70077ned",
                                "https://opendata.cbs.nl/#/CBS/en/dataset/70077eng",
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70004ned"
                            ]
                        }
                    ]
                }
            }
        }
    }

    test_output = refine_cbs_metadata(input_data, dsc_dict)

    assert test_output == expected_output


def test_statline_code_refinement(dsc_dict):
    """ A bare statline table code is expanded to a full statline URL.

    The dsc_dict fixture is requested as a parameter so that the fixture
    value is passed on, as in the other refine_cbs_metadata tests.
    """
    input_data = {
        "datasetVersion": {
            "metadataBlocks": {
                "CBSMetadata": {
                    "fields": [
                        {
                            "typeName": "statlineTabel",
                            "typeClass": "primitive",
                            "multiple": True,
                            "value": [
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/81414NED",
                                "37118"
                            ]
                        }
                    ]
                }
            }
        }
    }

    expected_output = {
        "datasetVersion": {
            "metadataBlocks": {
                "CBSMetadata": {
                    "fields": [
                        {
                            "typeName": "statlineTabel",
                            "typeClass": "primitive",
                            "multiple": True,
                            "value": [
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/81414NED",
                                "https://opendata.cbs.nl/#/CBS/nl/dataset/37118"
                            ]
                        }
                    ]
                }
            }
        }
    }

    test_output = refine_cbs_metadata(input_data, dsc_dict)

    assert test_output == expected_output

0 comments on commit cd05ed5

Please sign in to comment.