From 23c75dd2eaf198e3d86c14f996f4b87e210233f1 Mon Sep 17 00:00:00 2001
From: Cyril Matthey-Doret
Date: Tue, 30 Jan 2024 16:16:16 +0100
Subject: [PATCH] feat: cff to doi parser (#107)

* chore: update py version to match pkg description

* feat(parsers): add cff->doi parser

* refactor: register cff parser

* test(cff): add doctests for cff->doi parsing

* fix(cff): combine re flags with OR

* fix(parsers): use .decode() for bytes->str

* fix: restrict cff parsing to root dir
---
 gimie/parsers/__init__.py         |  6 ++-
 gimie/parsers/cff.py              | 88 +++++++++++++++++++++++++++++++
 gimie/parsers/license/__init__.py |  2 +-
 pyproject.toml                    |  2 +-
 4 files changed, 95 insertions(+), 3 deletions(-)
 create mode 100644 gimie/parsers/cff.py

diff --git a/gimie/parsers/__init__.py b/gimie/parsers/__init__.py
index cfac1ef..f89b66f 100644
--- a/gimie/parsers/__init__.py
+++ b/gimie/parsers/__init__.py
@@ -22,6 +22,7 @@ from gimie.io import Resource
 
 from gimie.parsers.abstract import Parser
 from gimie.parsers.license import LicenseParser, is_license_filename
+from gimie.parsers.cff import CffParser
 
 
 class ParserInfo(NamedTuple):
@@ -31,6 +32,7 @@ class ParserInfo(NamedTuple):
 
 PARSERS = {
     "license": ParserInfo(default=True, type=LicenseParser),
+    "cff": ParserInfo(default=True, type=CffParser),
 }
 
 
@@ -69,9 +71,11 @@ def select_parser(
     parsers:
         A set of parser names. If None, use the default collection.
     """
-    # Only parse licenses in the root directory
+    # Only parse licenses and citations in the root directory
     if is_license_filename(path.name) and len(path.parts) == 1:
         name = "license"
+    elif path.name == "CITATION.cff" and len(path.parts) == 1:
+        name = "cff"
     else:
         return None
 
diff --git a/gimie/parsers/cff.py b/gimie/parsers/cff.py
new file mode 100644
index 0000000..a00499a
--- /dev/null
+++ b/gimie/parsers/cff.py
@@ -0,0 +1,88 @@
+# Gimie
+# Copyright 2022 - Swiss Data Science Center (SDSC)
+# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Eidgenössische Technische Hochschule Zürich (ETHZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from io import BytesIO
+import re
+from typing import List, Optional, Set
+
+from rdflib.term import URIRef
+
+from gimie.graph.namespaces import SDO
+from gimie.parsers.abstract import Parser, Property
+
+
+class CffParser(Parser):
+    """Parse cff file to extract the doi into schema:citation."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parse(self, data: bytes) -> Set[Property]:
+        """Extracts a DOI link from a CFF file and returns a
+        set with a single (schema:citation, doi) tuple.
+        If no DOI is found, an empty set is returned.
+        """
+        props = set()
+        doi = get_cff_doi(data)
+
+        if doi:
+            props.add((SDO.citation, URIRef(doi)))
+        return props
+
+
+def get_cff_doi(data: bytes) -> Optional[str]:
+    """Given a CFF file, returns the DOI, if any.
+
+    Only a ``doi`` key at the start of a line is considered. Surrounding
+    whitespace (including the carriage return of CRLF line endings),
+    enclosing quotes and a trailing YAML comment are removed from the value.
+
+    Parameters
+    ----------
+    data:
+        The cff file body as bytes.
+
+    Returns
+    -------
+    Optional[str]
+        The DOI, or None if no doi key is found or its value is empty.
+
+    Examples
+    --------
+    >>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8"))
+    '10.5281/zenodo.1234'
+    >>> get_cff_doi(bytes('doi: "10.5281/zenodo.1234"  # v1', encoding="utf8"))
+    '10.5281/zenodo.1234'
+    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
+
+    """
+
+    match = re.search(
+        r"^doi: *(.*)$",
+        data.decode(),
+        flags=re.IGNORECASE | re.MULTILINE,
+    )
+    if match is None:
+        return None
+
+    # Drop a trailing YAML comment and surrounding whitespace, including the
+    # "\r" that CRLF line endings leave at the end of the captured value.
+    doi = re.sub(r"(?:^|\s+)#.*$", "", match.group(1)).strip()
+    # Unwrap a quoted YAML scalar.
+    if len(doi) >= 2 and doi[0] == doi[-1] and doi[0] in "\"'":
+        doi = doi[1:-1].strip()
+
+    return doi or None
diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py
index c3e78c8..fe66a81 100644
--- a/gimie/parsers/license/__init__.py
+++ b/gimie/parsers/license/__init__.py
@@ -66,7 +66,7 @@ def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
     """
     # Compute tfidf vector for input license
     vectorizer = load_tfidf_vectorizer()
-    input_vec = vectorizer.transform([str(data)])
+    input_vec = vectorizer.transform([data.decode()])
 
     # Load ids and tfidf vectors for spdx licenses
     spdx_licenses = load_spdx_ids()
diff --git a/pyproject.toml b/pyproject.toml
index 39e8ac2..1d77f57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
 
 # Dependency management
 [tool.poetry.dependencies]
-python = ">=3.9,<=3.11"
+python = ">=3.9,<=3.12"
 gitpython = ">=3.1.35"
 PyDriller = "^2.5"
 pyshacl = "^0.20.0"