From b1b1fa26c64ca5820240ee5917f7ca2554923f98 Mon Sep 17 00:00:00 2001 From: physikerwelt Date: Mon, 14 Oct 2024 20:48:32 +0200 Subject: [PATCH] Fix invalid encoding --- .../convertSoftware_from_json_toXml.py | 306 +++++------ test/data/software/software_with_swhid.json | 496 +++++++++--------- 2 files changed, 401 insertions(+), 401 deletions(-) diff --git a/src/zbmath_rest2oai/convertSoftware_from_json_toXml.py b/src/zbmath_rest2oai/convertSoftware_from_json_toXml.py index 632f961..f20f4e3 100644 --- a/src/zbmath_rest2oai/convertSoftware_from_json_toXml.py +++ b/src/zbmath_rest2oai/convertSoftware_from_json_toXml.py @@ -1,153 +1,153 @@ -import json -import xml.etree.ElementTree as ET - - -def json_to_xml(json_data): - # Create the root element - root = ET.Element('root' , attrib= { - 'xmlns:swhdeposit': "https://www.softwareheritage.org/schema/2018/deposit", - 'xmlns:swh': "https://www.softwareheritage.org/schema/2018/deposit", - 'xmlns:schema': "https://schema.org/" - }) - - # Add articles_count - articles_count = ET.SubElement(root, 'articles_count') - articles_count.text = str(json_data['result']['articles_count']) - - # Add authors - for author in json_data['result']['authors']: - author_elem = ET.SubElement(root, 'authors') - author_elem.text = author - - # Add classification - for classif in json_data['result']['classification']: - classif_elem = ET.SubElement(root, 'classification') - classif_elem.text = classif - - # Add swhdeposit:deposit section - swhdeposit_elem = ET.SubElement(root, 'swhdeposit:deposit') - swhdeposit_reference = ET.SubElement(swhdeposit_elem, 'swhdeposit:reference') - swhdeposit_object = ET.SubElement(swhdeposit_reference, 'swhdeposit:object') - swhdeposit_object.set('swhid', - json_data['swhdeposit:deposit']['swhdeposit:reference']['swhdeposit:object']['@swhid']) - - metadata_provenance = ET.SubElement(swhdeposit_elem, 'swhdeposit:metadata-provenance') - schema_url = ET.SubElement(metadata_provenance, 'schema:url') - schema_url.text = json_data['swhdeposit:deposit']['swhdeposit:metadata-provenance']['schema:url'] - - # Add dependencies - dependencies = ET.SubElement(root, 'dependencies') - dependencies.text = str(json_data['result']['dependencies']) - - # Add description - description = ET.SubElement(root, 'description') - description.text = json_data['result']['description'] - - # Add homepage - homepage = ET.SubElement(root, 'homepage') - homepage.text = json_data['result']['homepage'] - - # Add id - id_elem = ET.SubElement(root, 'id') - id_elem.text = str(json_data['result']['id']) - - # Add keywords - for keyword in json_data['result']['keywords']: - keyword_elem = ET.SubElement(root, 'keywords') - keyword_elem.text = keyword - - # Add license_terms - license_terms = ET.SubElement(root, 'license_terms') - license_terms.text = str(json_data['result']['license_terms']) - - # Add name - name = ET.SubElement(root, 'name') - name.text = json_data['result']['name'] - - # Add operating_systems - operating_systems = ET.SubElement(root, 'operating_systems') - operating_systems.text = str(json_data['result']['operating_systems']) - - # Add orms_id - orms_id = ET.SubElement(root, 'orms_id') - orms_id.text = str(json_data['result']['orms_id']) - - # Add programming_languages - programming_languages = ET.SubElement(root, 'programming_languages') - programming_languages.text = str(json_data['result']['programming_languages']) - - # Add related_software - for software in json_data['result']['related_software']: - related_software_elem = ET.SubElement(root, 'related_software') - software_id = ET.SubElement(related_software_elem, 'id') - software_id.text = str(software['id']) - software_name = ET.SubElement(related_software_elem, 'name') - software_name.text = software['name'] - - # Add source_code - source_code = ET.SubElement(root, 'source_code') - source_code.text = json_data['result']['source_code'] - - # Add standard_articles - for article in json_data['result']['standard_articles']: - article_elem = ET.SubElement(root, 'standard_articles') - article_authors = ET.SubElement(article_elem, 'authors') - article_id = ET.SubElement(article_elem, 'id') - article_id.text = str(article['id']) - article_source = ET.SubElement(article_elem, 'source') - article_source.text = article['source'] - article_title = ET.SubElement(article_elem, 'title') - article_title.text = article['title'] - article_year = ET.SubElement(article_elem, 'year') - article_year.text = article['year'] - - # Add zbmath_url - zbmath_url = ET.SubElement(root, 'zbmath_url') - zbmath_url.text = json_data['result']['zbmath_url'] - - return root - -def indent_xml(elem, level=0): - """Function to add indentation to XML.""" - i = "\n" + level * " " - if len(elem): - if not elem.text or not elem.text.strip(): - elem.text = i + " " - if not elem.tail or not elem.tail.strip(): - elem.tail = i - for sub_elem in elem: - indent_xml(sub_elem, level + 1) - if not sub_elem.tail or not sub_elem.tail.strip(): - sub_elem.tail = i - else: - if level and (not elem.tail or not elem.tail.strip()): - elem.tail = i - - -def convert_json_to_xml(json_file_path, xml_output_path): - # Load JSON from the given file path - with open(json_file_path, 'r') as json_file: - json_data = json.load(json_file) - - # Convert JSON to XML ElementTree - root_element = json_to_xml(json_data) - - # Indent the XML properly - indent_xml(root_element) - - # Create an ElementTree from the root element - tree = ET.ElementTree(root_element) - - # Write the XML to a file with declaration and UTF-8 encoding - tree.write(xml_output_path, encoding='utf-8', xml_declaration=True) - - print(f"XML data has been saved to {xml_output_path}") - - -# Example usage -json_file_path = '../../test/data/software/software_with_swhid.json' # Update with your file path -xml_output_path = '../../test/data/software/software_with_swhid.xml' # Update with your desired output path - -convert_json_to_xml(json_file_path, xml_output_path) - - +import json +import xml.etree.ElementTree as ET + + +def json_to_xml(json_data): + # Create the root element + root = ET.Element('root' , attrib= { + 'xmlns:swhdeposit': "https://www.softwareheritage.org/schema/2018/deposit", + 'xmlns:swh': "https://www.softwareheritage.org/schema/2018/deposit", + 'xmlns:schema': "https://schema.org/" + }) + + # Add articles_count + articles_count = ET.SubElement(root, 'articles_count') + articles_count.text = str(json_data['result']['articles_count']) + + # Add authors + for author in json_data['result']['authors']: + author_elem = ET.SubElement(root, 'authors') + author_elem.text = author + + # Add classification + for classif in json_data['result']['classification']: + classif_elem = ET.SubElement(root, 'classification') + classif_elem.text = classif + + # Add swhdeposit:deposit section + swhdeposit_elem = ET.SubElement(root, 'swhdeposit:deposit') + swhdeposit_reference = ET.SubElement(swhdeposit_elem, 'swhdeposit:reference') + swhdeposit_object = ET.SubElement(swhdeposit_reference, 'swhdeposit:object') + swhdeposit_object.set('swhid', + json_data['swhdeposit:deposit']['swhdeposit:reference']['swhdeposit:object']['@swhid']) + + metadata_provenance = ET.SubElement(swhdeposit_elem, 'swhdeposit:metadata-provenance') + schema_url = ET.SubElement(metadata_provenance, 'schema:url') + schema_url.text = json_data['swhdeposit:deposit']['swhdeposit:metadata-provenance']['schema:url'] + + # Add dependencies + dependencies = ET.SubElement(root, 'dependencies') + dependencies.text = str(json_data['result']['dependencies']) + + # Add description + description = ET.SubElement(root, 'description') + description.text = json_data['result']['description'] + + # Add homepage + homepage = ET.SubElement(root, 'homepage') + homepage.text = json_data['result']['homepage'] + + # Add id + id_elem = ET.SubElement(root, 'id') + id_elem.text = str(json_data['result']['id']) + + # Add keywords + for keyword in json_data['result']['keywords']: + keyword_elem = ET.SubElement(root, 'keywords') + keyword_elem.text = keyword + + # Add license_terms + license_terms = ET.SubElement(root, 'license_terms') + license_terms.text = str(json_data['result']['license_terms']) + + # Add name + name = ET.SubElement(root, 'name') + name.text = json_data['result']['name'] + + # Add operating_systems + operating_systems = ET.SubElement(root, 'operating_systems') + operating_systems.text = str(json_data['result']['operating_systems']) + + # Add orms_id + orms_id = ET.SubElement(root, 'orms_id') + orms_id.text = str(json_data['result']['orms_id']) + + # Add programming_languages + programming_languages = ET.SubElement(root, 'programming_languages') + programming_languages.text = str(json_data['result']['programming_languages']) + + # Add related_software + for software in json_data['result']['related_software']: + related_software_elem = ET.SubElement(root, 'related_software') + software_id = ET.SubElement(related_software_elem, 'id') + software_id.text = str(software['id']) + software_name = ET.SubElement(related_software_elem, 'name') + software_name.text = software['name'] + + # Add source_code + source_code = ET.SubElement(root, 'source_code') + source_code.text = json_data['result']['source_code'] + + # Add standard_articles + for article in json_data['result']['standard_articles']: + article_elem = ET.SubElement(root, 'standard_articles') + article_authors = ET.SubElement(article_elem, 'authors') + article_id = ET.SubElement(article_elem, 'id') + article_id.text = str(article['id']) + article_source = ET.SubElement(article_elem, 'source') + article_source.text = article['source'] + article_title = ET.SubElement(article_elem, 'title') + article_title.text = article['title'] + article_year = ET.SubElement(article_elem, 'year') + article_year.text = article['year'] + + # Add zbmath_url + zbmath_url = ET.SubElement(root, 'zbmath_url') + zbmath_url.text = json_data['result']['zbmath_url'] + + return root + +def indent_xml(elem, level=0): + """Function to add indentation to XML.""" + i = "\n" + level * " " + if len(elem): + if not elem.text or not elem.text.strip(): + elem.text = i + " " + if not elem.tail or not elem.tail.strip(): + elem.tail = i + for sub_elem in elem: + indent_xml(sub_elem, level + 1) + if not sub_elem.tail or not sub_elem.tail.strip(): + sub_elem.tail = i + else: + if level and (not elem.tail or not elem.tail.strip()): + elem.tail = i + + +def convert_json_to_xml(json_file_path, xml_output_path): + # Load JSON from the given file path + with open(json_file_path, 'r') as json_file: + json_data = json.load(json_file) + + # Convert JSON to XML ElementTree + root_element = json_to_xml(json_data) + + # Indent the XML properly + indent_xml(root_element) + + # Create an ElementTree from the root element + tree = ET.ElementTree(root_element) + + # Write the XML to a file with declaration and UTF-8 encoding + tree.write(xml_output_path, encoding='utf-8', xml_declaration=True) + + print(f"XML data has been saved to {xml_output_path}") + + +# Example usage +json_file_path = '../../test/data/software/software_with_swhid.json' # Update with your file path +xml_output_path = '../../test/data/software/software_with_swhid.xml' # Update with your desired output path + +convert_json_to_xml(json_file_path, xml_output_path) + + diff --git a/test/data/software/software_with_swhid.json b/test/data/software/software_with_swhid.json index c14b77b..8a56b20 100644 --- a/test/data/software/software_with_swhid.json +++ b/test/data/software/software_with_swhid.json @@ -1,249 +1,249 @@ -{ - "result": { - "articles_count": 2829, - "authors": [ - "Developers, The Sage", - "Stein, William", - "Joyner, David", - "Kohel, David", - "Cremona, John", - "Eröcal, Burçin" - ], - "classification": [ - "05", - "11", - "14", - "20", - "68", - "00", - "01", - "03", - "06", - "12", - "13", - "15", - "16", - "17", - "18", - "19", - "22", - "26", - "28", - "30", - "31", - "32", - "33", - "34", - "35", - "37", - "39", - "40", - "41", - "42", - "43", - "44", - "46", - "47", - "49", - "51", - "52", - "53", - "54", - "55", - "57", - "58", - "60", - "62", - "65", - "70", - "74", - "76", - "78", - "80", - "81", - "82", - "83", - "85", - "86", - "90", - "91", - "92", - "93", - "94", - "97" - ], - "dependencies": null, - "description": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "homepage": "http://www.sagemath.org", - "id": 825, - "keywords": [ - "orms", - "Python", - "Cython", - "Sage", - "Open Source", - "Interfaces" - ], - "license_terms": null, - "name": "SageMath", - "operating_systems": null, - "orms_id": "255", - "programming_languages": null, - "related_software": [ - { - "id": 540, - "name": "Magma" - }, - { - "id": 320, - "name": "GAP" - }, - { - "id": 23170, - "name": "GitHub" - }, - { - "id": 7248, - "name": "OEIS" - }, - { - "id": 680, - "name": "PARI/GP" - }, - { - "id": 866, - "name": "SINGULAR" - }, - { - "id": 16448, - "name": "Sage-Combinat" - }, - { - "id": 537, - "name": "Macaulay2" - }, - { - "id": 554, - "name": "Mathematica" - }, - { - "id": 545, - "name": "Maple" - }, - { - "id": 7249, - "name": "LMFDB" - }, - { - "id": 14460, - "name": "Python" - }, - { - "id": 27596, - "name": "ecdata" - }, - { - "id": 611, - "name": "nauty" - }, - { - "id": 7823, - "name": "Traces" - }, - { - "id": 560, - "name": "Maxima" - }, - { - "id": 4968, - "name": "DLMF" - }, - { - "id": 4698, - "name": "Gfan" - }, - { - "id": 23728, - "name": "MathOverflow" - }, - { - "id": 6293, - "name": "SciPy" - } - ], - "source_code": "https://github.com/sagemath/sage", - "standard_articles": [ - { - "authors": [], - "id": 7045592, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2019" - }, - { - "authors": [], - "id": 6418360, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2015" - }, - { - "authors": [], - "id": 6532438, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2013" - }, - { - "authors": [], - "id": 6462232, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2013" - }, - { - "authors": [], - "id": 6114556, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2012" - }, - { - "authors": [], - "id": 5785538, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2010" - }, - { - "authors": [], - "id": 5380273, - "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", - "year": "2008" - } - ], - "zbmath_url": "https://zbmath.org/software/825" - }, - "status": { - "execution": "successful request", - "execution_bool": true, - "internal_code": "ok", - "last_id": null, - "nr_total_results": 1, - "nr_request_results": 1, - "query_execution_time_in_seconds": 0.41052913665771484, - "status_code": 200, - "time_stamp": "2024-09-26T18:55:42Z" - }, - "swhdeposit:deposit": { - "swhdeposit:reference": { - "swhdeposit:object": { - "@swhid": "swh:1:snp:5132188cdf169d8ffcf2e44f41ad30734ecbf685" - } - }, - "swhdeposit:metadata-provenance": { - "schema:url": "https://staging.swmath.org/" - } - } +{ + "result": { + "articles_count": 2829, + "authors": [ + "Developers, The Sage", + "Stein, William", + "Joyner, David", + "Kohel, David", + "Cremona, John", + "Eröcal, Burçin" + ], + "classification": [ + "05", + "11", + "14", + "20", + "68", + "00", + "01", + "03", + "06", + "12", + "13", + "15", + "16", + "17", + "18", + "19", + "22", + "26", + "28", + "30", + "31", + "32", + "33", + "34", + "35", + "37", + "39", + "40", + "41", + "42", + "43", + "44", + "46", + "47", + "49", + "51", + "52", + "53", + "54", + "55", + "57", + "58", + "60", + "62", + "65", + "70", + "74", + "76", + "78", + "80", + "81", + "82", + "83", + "85", + "86", + "90", + "91", + "92", + "93", + "94", + "97" + ], + "dependencies": null, + "description": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "homepage": "http://www.sagemath.org", + "id": 825, + "keywords": [ + "orms", + "Python", + "Cython", + "Sage", + "Open Source", + "Interfaces" + ], + "license_terms": null, + "name": "SageMath", + "operating_systems": null, + "orms_id": "255", + "programming_languages": null, + "related_software": [ + { + "id": 540, + "name": "Magma" + }, + { + "id": 320, + "name": "GAP" + }, + { + "id": 23170, + "name": "GitHub" + }, + { + "id": 7248, + "name": "OEIS" + }, + { + "id": 680, + "name": "PARI/GP" + }, + { + "id": 866, + "name": "SINGULAR" + }, + { + "id": 16448, + "name": "Sage-Combinat" + }, + { + "id": 537, + "name": "Macaulay2" + }, + { + "id": 554, + "name": "Mathematica" + }, + { + "id": 545, + "name": "Maple" + }, + { + "id": 7249, + "name": "LMFDB" + }, + { + "id": 14460, + "name": "Python" + }, + { + "id": 27596, + "name": "ecdata" + }, + { + "id": 611, + "name": "nauty" + }, + { + "id": 7823, + "name": "Traces" + }, + { + "id": 560, + "name": "Maxima" + }, + { + "id": 4968, + "name": "DLMF" + }, + { + "id": 4698, + "name": "Gfan" + }, + { + "id": 23728, + "name": "MathOverflow" + }, + { + "id": 6293, + "name": "SciPy" + } + ], + "source_code": "https://github.com/sagemath/sage", + "standard_articles": [ + { + "authors": [], + "id": 7045592, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2019" + }, + { + "authors": [], + "id": 6418360, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2015" + }, + { + "authors": [], + "id": 6532438, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2013" + }, + { + "authors": [], + "id": 6462232, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2013" + }, + { + "authors": [], + "id": 6114556, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2012" + }, + { + "authors": [], + "id": 5785538, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2010" + }, + { + "authors": [], + "id": 5380273, + "source": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "title": "zbMATH Open Web Interface contents unavailable due to conflicting licenses.", + "year": "2008" + } + ], + "zbmath_url": "https://zbmath.org/software/825" + }, + "status": { + "execution": "successful request", + "execution_bool": true, + "internal_code": "ok", + "last_id": null, + "nr_total_results": 1, + "nr_request_results": 1, + "query_execution_time_in_seconds": 0.41052913665771484, + "status_code": 200, + "time_stamp": "2024-09-26T18:55:42Z" + }, + "swhdeposit:deposit": { + "swhdeposit:reference": { + "swhdeposit:object": { + "@swhid": "swh:1:snp:5132188cdf169d8ffcf2e44f41ad30734ecbf685" + } + }, + "swhdeposit:metadata-provenance": { + "schema:url": "https://staging.swmath.org/" + } + } } \ No newline at end of file