Skip to content

Commit

Permalink
Implement set logic for documents
Browse files Browse the repository at this point in the history
Ref #103
  • Loading branch information
physikerwelt committed Oct 17, 2024
1 parent 034b3bf commit 260bf68
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 4 deletions.
20 changes: 18 additions & 2 deletions src/zbmath_rest2oai/getAsXml.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ def apply_zbmath_api_fixes(result, prefix):
result['states'] = states


def extract_tags(result):
    """Derive the set tags for a single document record.

    The tags are the distinct two-character prefixes of the record's MSC
    codes in ascending order, followed by ``'JFM'`` when the record's
    ``database`` field equals ``'JFM'``.

    Args:
        result: Mapping describing one document. The optional ``msc`` key
            holds a list of mappings with a ``code`` string; the optional
            ``database`` key holds the source database name.

    Returns:
        A list of tag strings; empty when the record carries no usable MSC
        code and is not a JFM record.
    """
    # ``msc`` may be absent or explicitly null in the JSON payload; both
    # cases mean "no classification" rather than an error.
    mscs = result.get('msc') or []
    # Skip entries without a usable code instead of raising or emitting an
    # empty tag; the set takes care of de-duplication.
    prefixes = {
        entry['code'][:2]
        for entry in mscs
        if entry and entry.get('code')
    }
    tags = sorted(prefixes)
    # 'JFM' is appended after sorting so it always comes last.
    if result.get('database') == 'JFM':
        tags.append('JFM')
    return tags


def final_xml2(api_source, prefix):
headers = {'Accept': 'application/json'}
r = requests.get(api_source, headers=headers)
Expand All @@ -69,12 +82,15 @@ def final_xml2(api_source, prefix):
raise Exception(f"Unexpected response with status code {r.status_code}: {r.text}")
json = r.json()
dict_math_entities = {}
tags = {}
for result in json["result"]:
apply_zbmath_api_fixes(result, prefix)
dict_math_entities[result["id"]] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
identifier = result["id"]
dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
result,
closed_tags_for=[[], '', [None], None]))
return [dict_math_entities, r.elapsed.total_seconds()]
tags[identifier] = extract_tags(result)
return [dict_math_entities, r.elapsed.total_seconds(), tags]


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions src/zbmath_rest2oai/writeOai.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ async def async_write_oai(xml_contents, ingest_format, tags=None):


def write_oai(api_source, prefix, ingest_format):
test_xml, time_rest = getAsXml.final_xml2(api_source, prefix)
xml_contents, time_rest, tags = getAsXml.final_xml2(api_source, prefix)
start = timer()
records, last_id = asyncio.run(async_write_oai(test_xml, ingest_format))
records, last_id = asyncio.run(async_write_oai(xml_contents, ingest_format, tags))

last_id = int(last_id.removeprefix(prefix))
return {
Expand Down
22 changes: 22 additions & 0 deletions test/test_extract_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest

import requests

from zbmath_rest2oai.getAsXml import extract_tags

from zbmath_rest2oai import getAsXml

# Structured-search query used by the test below: page 0, up to 10 results
# per page, filtered by "zbmath id" 2500495 (the space is URL-encoded as %20).
API_SOURCE = 'https://api.zbmath.org/v1/document/_structured_search?page=0&results_per_page=10&zbmath%20id=2500495'


class PlainXmlTest(unittest.TestCase):
    """Live check of ``extract_tags`` against a record fetched from the API."""

    def test_similarity(self):
        """The first result returned for ``API_SOURCE`` yields ``['11', 'JFM']``."""
        headers = {'Accept': 'application/json'}
        # Bound the wait so a stalled connection fails the test instead of
        # hanging the whole test run.
        r = requests.get(API_SOURCE, headers=headers, timeout=30)
        # Surface an HTTP failure directly rather than as a KeyError or a
        # JSON decoding error further down.
        r.raise_for_status()
        real_tags = extract_tags(r.json()['result'][0])
        # assertEqual is not stripped under ``python -O`` and reports both
        # values on failure, unlike a bare ``assert``.
        self.assertEqual(real_tags, ['11', 'JFM'])



if __name__ == '__main__':
unittest.main()

0 comments on commit 260bf68

Please sign in to comment.