Skip to content

Commit

Permalink
feat: cff to doi parser (#107)
Browse files Browse the repository at this point in the history
* chore: update py version to match pkg description

* feat(parsers): add cff->doi parser

* refactor: register cff parser

* test(cff): add doctests for cff->doi parsing

* fix(cff): combine re flags with OR

* fix(parsers): use .decode() for bytes->str

* fix: restrict cff parsing to root dir
  • Loading branch information
cmdoret authored Jan 30, 2024
1 parent 7d8c3a4 commit 23c75dd
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 3 deletions.
6 changes: 5 additions & 1 deletion gimie/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from gimie.io import Resource
from gimie.parsers.abstract import Parser
from gimie.parsers.license import LicenseParser, is_license_filename
from gimie.parsers.cff import CffParser


class ParserInfo(NamedTuple):
Expand All @@ -31,6 +32,7 @@ class ParserInfo(NamedTuple):

PARSERS = {
"license": ParserInfo(default=True, type=LicenseParser),
"cff": ParserInfo(default=True, type=CffParser),
}


Expand Down Expand Up @@ -69,9 +71,11 @@ def select_parser(
parsers:
A set of parser names. If None, use the default collection.
"""
# Only parse licenses in the root directory
# Only parse licenses and citations in the root directory
if is_license_filename(path.name) and len(path.parts) == 1:
name = "license"
elif path.name == "CITATION.cff" and len(path.parts) == 1:
name = "cff"
else:
return None

Expand Down
72 changes: 72 additions & 0 deletions gimie/parsers/cff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from io import BytesIO
import re
from typing import List, Optional, Set

from rdflib.term import URIRef

from gimie.graph.namespaces import SDO
from gimie.parsers.abstract import Parser, Property


class CffParser(Parser):
    """Parser turning a CFF file body into a ``schema:citation`` property.

    The DOI is looked up with :func:`get_cff_doi`; when one is found it
    is emitted as the object of a single ``schema:citation`` predicate.
    """

    def __init__(self):
        super().__init__()

    def parse(self, data: bytes) -> Set[Property]:
        """Build the property set for the CFF file content ``data``.

        Returns a set holding one ``(SDO.citation, URIRef(doi))`` tuple
        when a DOI is found in ``data``, and an empty set otherwise.
        """
        found = get_cff_doi(data)

        # Nothing usable (None or empty string): no property to report.
        if not found:
            return set()

        # NOTE(review): the DOI string is wrapped in URIRef as returned
        # by get_cff_doi; no URL prefix is added here.
        return {(SDO.citation, URIRef(found))}


def get_cff_doi(data: bytes) -> Optional[str]:
    """Given a CFF file, returns the DOI, if any.

    Only a top-level ``doi:`` key (one starting at the beginning of a
    line, matched case-insensitively) is considered. The value is
    cleaned before being returned: surrounding whitespace (including
    the carriage return of CRLF files) is removed, YAML quotes around
    the value are dropped, and a trailing inline ``# comment`` is
    discarded for unquoted values.

    Parameters
    ----------
    data:
        The cff file body as bytes.

    Returns
    -------
    Optional[str]
        The DOI string, or None when the file has no top-level ``doi``
        key or its value is empty.

    Examples
    --------
    >>> get_cff_doi(bytes("doi: 10.5281/zenodo.1234", encoding="utf8"))
    '10.5281/zenodo.1234'
    >>> get_cff_doi(bytes("abc: def", encoding="utf8"))
    >>> get_cff_doi(b'doi: "10.5281/zenodo.1234"')
    '10.5281/zenodo.1234'
    >>> get_cff_doi(b"doi: 10.5281/zenodo.1234  # archived release")
    '10.5281/zenodo.1234'
    """
    # Undecodable bytes elsewhere in the file must not prevent reading
    # the (ASCII) DOI line, so replace them instead of raising.
    text = data.decode("utf-8", errors="replace")

    match = re.search(
        r"^doi: *(.*)$",
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    if match is None:
        return None

    # "." also matches "\r", so CRLF files would otherwise leak a
    # trailing carriage return into the DOI.
    value = match.group(1).strip()

    if value[:1] in ('"', "'"):
        # Quoted YAML scalar: keep what lies between the quotes and
        # ignore anything after the closing one (e.g. a comment).
        closing = value.find(value[0], 1)
        if closing != -1:
            value = value[1:closing].strip()
    else:
        # Plain scalar: a "#" at the start or preceded by whitespace
        # begins a YAML comment.
        value = re.sub(r"(?:^|\s+)#.*$", "", value)

    # An empty value carries no DOI.
    return value or None
2 changes: 1 addition & 1 deletion gimie/parsers/license/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
"""
# Compute tfidf vector for input license
vectorizer = load_tfidf_vectorizer()
input_vec = vectorizer.transform([str(data)])
input_vec = vectorizer.transform([data.decode()])

# Load ids and tfidf vectors for spdx licenses
spdx_licenses = load_spdx_ids()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ classifiers = [
# Dependency management

[tool.poetry.dependencies]
python = ">=3.9,<=3.11"
python = ">=3.9,<=3.12"
gitpython = ">=3.1.35"
PyDriller = "^2.5"
pyshacl = "^0.20.0"
Expand Down

0 comments on commit 23c75dd

Please sign in to comment.