diff --git a/pyproject.toml b/pyproject.toml
index d63f30f..cd1d16a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "metadata-refiner"
-version = "1.1.0"
+version = "1.2.0"
 description = "A service that refines metadata."
 authors = ["Thomas van Erven"]
 
diff --git a/src/refiners/cbs_refiner.py b/src/refiners/cbs_refiner.py
index 014c9b6..2eac492 100644
--- a/src/refiners/cbs_refiner.py
+++ b/src/refiners/cbs_refiner.py
@@ -1,11 +1,14 @@
+import re
+
 from fastapi import HTTPException
 
 
 def refine_cbs_metadata(metadata: dict, dsc_dictionary) -> dict:
-    """ Refines CBS metadata by refining the alt title and keyword field.
+    """ Refines CBS metadata: the alt title, keyword and statline fields.
 
     The alternative title is either matched on a table or cleaned up.
     The keyword is split on "/" character and added as separate keywords.
+    Statline links that are bare table ids are formatted as clickable URLs.
 
     :param metadata: CBS metadata to be refined.
     :param dsc_dictionary: DSC dictionary containing refined alt titles.
@@ -13,21 +16,35 @@ def refine_cbs_metadata(metadata: dict, dsc_dictionary) -> dict:
     :raises HTTPException: Raises if required keys are missing from metadata.
     """
     try:
-        fields = metadata['datasetVersion']['metadataBlocks']['citation'][
-            'fields']
+        metadata_blocks = metadata['datasetVersion']['metadataBlocks']
+        # Resolve the field lists inside the try so that a block without a
+        # 'fields' key is reported as a 422 as well, not as a server error.
+        citation_fields = (metadata_blocks['citation']['fields']
+                           if 'citation' in metadata_blocks else None)
+        cbs_fields = (metadata_blocks['CBSMetadata']['fields']
+                      if 'CBSMetadata' in metadata_blocks else None)
     except KeyError as error:
         raise HTTPException(status_code=422, detail=str(error))
 
-    alt_title_dict = get_field('alternativeTitle', fields)
-    if 'value' in alt_title_dict:
-        alt_title_dict['value'] = refine_alternative_title(
-            alt_title_dict['value'], dsc_dictionary)
+    # refinements for fields in the citation metadata block.
+    if citation_fields is not None:
+        alt_title_dict = get_field('alternativeTitle', citation_fields)
+        if 'value' in alt_title_dict:
+            alt_title_dict['value'] = refine_alternative_title(
+                alt_title_dict['value'], dsc_dictionary)
 
-    keyword_dict = get_field('keyword', fields)
-    if 'value' in keyword_dict:
-        keyword_dict['value'] = refine_keywords(
-            keyword_dict['value'])
+        keyword_dict = get_field('keyword', citation_fields)
+        if 'value' in keyword_dict:
+            keyword_dict['value'] = refine_keywords(
+                keyword_dict['value'])
 
+    # refinements for fields in the CBSMetadata block.
+    if cbs_fields is not None:
+        statline_dict = get_field('statlineTabel', cbs_fields)
+        if 'value' in statline_dict:
+            statline_dict['value'] = refine_statline_table(
+                statline_dict['value'])
+
     return metadata
 
 
@@ -82,10 +99,11 @@ def Add_split_keywords(keyword):
 
 def refine_alternative_title(alt_title, dsc_dictionary):
     """
-    Refine an alternative title by looking it up in a dictionary or cleaning it if not found.
+    Refine an alternative title by looking it up in a dictionary or
+    cleaning it if not found.
 
     :param alt_title: The alternative title to refine.
-    :param dsc_dictionary: The dictionary containing refined alternative titles.
+    :param dsc_dictionary: Dictionary containing refined alternative titles.
     :return: The refined alternative title.
     """
     try:
@@ -140,3 +158,32 @@ def clean_alternative_title(alternative_title: str):
     alternative_title = alternative_title.rstrip('_')
 
     return alternative_title
+
+
+def refine_statline_table(statlineLinks: list) -> list:
+    """ Reformats statline links to a clickable URL if they contain a code.
+
+    Some links to the statline tables only include the id of the table.
+    Entries that already are URLs are kept as they are; bare table ids are
+    formatted to be clickable URLs. The order of the entries is preserved.
+
+    :param statlineLinks: A list of URLs and table ids. (Linking to statline).
+    :return: List with URLs linking to statline tables.
+    """
+    return [link if is_url(link) else format_statline_url(link)
+            for link in statlineLinks]
+
+
+def format_statline_url(statline_code):
+    """ Alters a statline code that identifies a table to be a clickable url.
+
+    :param statline_code: The code linking to a table.
+    :return: The URL that links to a statline table.
+    """
+    return f'https://opendata.cbs.nl/#/CBS/nl/dataset/{statline_code}'
+
+
+def is_url(s):
+    """ Checks if a string is a URL """
+    url_pattern = r'^https?://\S+$'
+    return bool(re.match(url_pattern, s))
diff --git a/src/tests/test_cbs_refiner.py b/src/tests/test_cbs_refiner.py
index bc2c8fe..bf96863 100644
--- a/src/tests/test_cbs_refiner.py
+++ b/src/tests/test_cbs_refiner.py
@@ -34,7 +34,6 @@ def test_remove_jjjj_vv():
 
 
 def test_cbs_metadata_refiner_dsc_dictionary(dsc_dict):
-    print(dsc_dict)
     input_data = {
         "datasetVersion": {
             "metadataBlocks": {
@@ -196,9 +195,6 @@ def test_cbs_metadata_refiner_refine_keywords(dsc_dict):
     assert test_output == expected_output
 
 
-    assert test_output == expected_output
-
-
 def test_refine_keywords():
     fields = [
         {'typeName': 'keyword', 'value': [
@@ -210,8 +206,10 @@ def test_refine_keywords():
 
     # Test case 1: Keywords with slashes
     expected_output = [
-        {'keywordValue': {'typeName': 'keywordValue', 'multiple': False, 'typeClass': 'primitive', 'value': 'keyword1'}},
-        {'keywordValue': {'typeName': 'keywordValue', 'multiple': False, 'typeClass': 'primitive', 'value': 'keyword2'}}
+        {'keywordValue': {'typeName': 'keywordValue', 'multiple': False,
+                          'typeClass': 'primitive', 'value': 'keyword1'}},
+        {'keywordValue': {'typeName': 'keywordValue', 'multiple': False,
+                          'typeClass': 'primitive', 'value': 'keyword2'}}
     ]
     output = refine_keywords(fields[0]['value'])
     assert output == expected_output
@@ -225,7 +223,8 @@ def test_refine_keywords():
         {'typeName': 'other', 'value': 'some value'}
     ]
     expected_output_no_keyword_value = []
-    output_no_keyword_value = refine_keywords(fields_no_keyword_value[0]['value'])
+    output_no_keyword_value = refine_keywords(
+        fields_no_keyword_value[0]['value'])
     assert output_no_keyword_value == expected_output_no_keyword_value
 
     # Test case 3: Empty keyword value
@@ -238,3 +237,97 @@ def test_refine_keywords():
     assert output_empty_keywords == expected_output_empty_keywords
 
 
+def test_refine_statline(dsc_dict):
+    input_data = {
+        "datasetVersion": {
+            "metadataBlocks": {
+                "CBSMetadata": {
+                    "fields": [
+                        {
+                            "typeName": "statlineTabel",
+                            "typeClass": "primitive",
+                            "multiple": True,
+                            "value": [
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70810ned",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70077ned",
+                                "https://opendata.cbs.nl/#/CBS/en/dataset/70077eng",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70004ned"
+                            ]
+                        }
+                    ]
+                }
+            }
+        }
+    }
+
+    expected_output = {
+        "datasetVersion": {
+            "metadataBlocks": {
+                "CBSMetadata": {
+                    "fields": [
+                        {
+                            "typeName": "statlineTabel",
+                            "typeClass": "primitive",
+                            "multiple": True,
+                            "value": [
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70810ned",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70077ned",
+                                "https://opendata.cbs.nl/#/CBS/en/dataset/70077eng",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/70004ned"
+                            ]
+                        }
+                    ]
+                }
+            }
+        }
+    }
+
+    test_output = refine_cbs_metadata(input_data, dsc_dict)
+
+    assert test_output == expected_output
+
+
+def test_statline_code_refinement(dsc_dict):
+    input_data = {
+        "datasetVersion": {
+            "metadataBlocks": {
+                "CBSMetadata": {
+                    "fields": [
+                        {
+                            "typeName": "statlineTabel",
+                            "typeClass": "primitive",
+                            "multiple": True,
+                            "value": [
+                                "37118",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/81414NED"
+                            ]
+                        }
+                    ]
+                }
+            }
+        }
+    }
+
+    expected_output = {
+        "datasetVersion": {
+            "metadataBlocks": {
+                "CBSMetadata": {
+                    "fields": [
+                        {
+                            "typeName": "statlineTabel",
+                            "typeClass": "primitive",
+                            "multiple": True,
+                            "value": [
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/37118",
+                                "https://opendata.cbs.nl/#/CBS/nl/dataset/81414NED"
+                            ]
+                        }
+                    ]
+                }
+            }
+        }
+    }
+
+    test_output = refine_cbs_metadata(input_data, dsc_dict)
+
+    assert test_output == expected_output