-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding support for self-referencing (#59)
* refactor: change default logging level * feat: tested version of self_references * chore * feat: Support and test asynchronous calls * chore
- Loading branch information
1 parent
0afbada
commit 7670c44
Showing
15 changed files
with
255 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from ..scholar import get_citations_from_title | ||
from .core import self_references, self_references_paper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import asyncio | ||
import logging | ||
import re | ||
import sys | ||
from typing import Dict, Iterable, Union | ||
|
||
import httpx | ||
|
||
from ..utils import optional_async | ||
from .utils import check_overlap, doi_pattern | ||
|
||
# Configure the root logger to write to stdout at INFO level when this module
# is imported.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger used for the verbose output of the functions below.
logger = logging.getLogger(__name__)
# Raise the threshold of the "httpx" logger so only WARNING and above is shown.
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||
|
||
@optional_async
async def self_references(
    inputs: Union[str, Iterable[str]],
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Dict[str, Union[float, int]]]:
    """
    Analyze self-references for a DOI or a list of DOIs.

    Args:
        inputs: A single DOI or an iterable of DOIs.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary where the keys are the input strings and the values are
        dictionaries mapping authors to their self-citations. Inputs in which
        more than one DOI is found are skipped (a warning is logged) and do not
        appear in the result.

    Raises:
        NotImplementedError: If an input does not contain a DOI.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    # Validate every input BEFORE creating any awaitable. If the awaitables
    # were created inside this loop, a later invalid input would raise and
    # leave the earlier ones un-awaited.
    targets = []  # (original input string, extracted DOI)
    for sample in inputs:
        dois = re.findall(doi_pattern, sample, re.IGNORECASE)
        if not dois:
            # TODO: Check that it is a proper name or an ORCID ID
            raise NotImplementedError(
                "Analyzing self-references of whole authors is not yet implemented."
            )
        if len(dois) > 1:
            # Ambiguous input: keep skipping it, but no longer silently.
            logger.warning(
                "Skipping input %r: expected exactly one DOI, found %d.",
                sample,
                len(dois),
            )
            continue
        targets.append((sample, dois[0]))

    # Run all per-paper analyses concurrently; gather preserves input order.
    completed = await asyncio.gather(
        *(
            self_references_paper(doi, verbose=verbose, relative=relative)
            for _, doi in targets
        )
    )

    return {sample: result for (sample, _), result in zip(targets, completed)}
|
||
|
||
@optional_async
async def self_references_paper(
    doi: str,
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Union[float, int]]:
    """
    Analyze self-references for a single DOI.

    The paper is looked up on the Semantic Scholar Graph API. For every author
    of the paper, the number of references that list an overlapping author name
    (as decided by `check_overlap`) is counted.

    Args:
        doi: The DOI to analyze.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary mapping authors to their self-citations.

    Raises:
        ValueError: If no references are found for the given DOI.
        httpx.HTTPStatusError: If the API responds with an error status.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            params={"fields": "title,authors,references.authors"},
        )
        response.raise_for_status()
        paper = response.json()

    # Use .get so a missing or null field produces the documented ValueError
    # rather than a KeyError/TypeError.
    references = paper.get("references")
    if not references:
        raise ValueError("Could not find citations from Semantic Scholar")

    # Authors without a name cannot be matched, so they are left out.
    counts: Dict[str, int] = {
        a["name"]: 0 for a in (paper.get("authors") or []) if a.get("name")
    }

    for ref in references:
        # Tolerate references whose author list or author names are null.
        ref_names = {
            a["name"] for a in (ref.get("authors") or []) if a.get("name")
        }
        for author in counts:
            if any(check_overlap(author, name) for name in ref_names):
                counts[author] += 1
    total = len(references)

    if verbose:
        logger.info(f"Self references in \"{paper['title']}\"")
        logger.info(f" N = {len(references)}")
        for author, self_cites in counts.items():
            logger.info(f" {author}: {100*(self_cites/total):.2f}% self-references")

    if relative:
        # Build a new dict so the int-typed counts are not overwritten with
        # floats while being iterated.
        return {
            author: round(100 * self_cites / total, 2)
            for author, self_cites in counts.items()
        }

    return counts
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import asyncio | ||
import logging | ||
import time | ||
|
||
import pytest | ||
|
||
from paperscraper.citations import self_references | ||
|
||
# Suppress log records of severity INFO and below while these tests run.
logging.disable(logging.INFO)
|
||
|
||
class TestSelfReferences:
    """Tests for `self_references` on single DOIs, DOI lists and invalid input."""

    @pytest.fixture
    def dois(self):
        """DOIs used as inputs by the tests below."""
        return [
            "10.1038/s43586-024-00334-2",
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    @staticmethod
    def _check_entries(result, relative):
        """Assert key/value types and value ranges of every entry in `result`."""
        for doi, self_cite_dict in result.items():
            assert isinstance(doi, str)
            assert isinstance(self_cite_dict, dict)
            for author, self_cites in self_cite_dict.items():
                assert isinstance(author, str)
                if relative:
                    # Percentages are floats within [0, 100].
                    assert isinstance(self_cites, float)
                    assert 0 <= self_cites <= 100
                else:
                    # Raw counts are non-negative integers.
                    assert isinstance(self_cites, int)
                    assert self_cites >= 0

    def test_single_doi(self, dois):
        """A single DOI string yields a non-empty, well-typed result."""
        for relative in (True, False):
            result = self_references(dois[0], relative=relative)
            assert isinstance(result, dict)
            assert len(result) > 0
            self._check_entries(result, relative)

    def test_multiple_dois(self, dois):
        """A list of DOIs yields one well-typed entry per DOI."""
        batch = dois[1:]
        for relative in (True, False):
            result = self_references(batch, relative=relative)
            assert isinstance(result, dict)
            assert len(result) == len(batch)
            self._check_entries(result, relative)

    def test_not_implemented_error(self):
        """An author name instead of a DOI raises NotImplementedError."""
        with pytest.raises(NotImplementedError):
            self_references("John Jumper")

    def test_compare_async_and_sync_performance(self, dois):
        """
        Compares the execution time of asynchronous and synchronous `self_references`
        for a list of DOIs.
        """
        # Time one batched call covering all DOIs.
        batch_start = time.perf_counter()
        self_references(dois)
        async_duration = time.perf_counter() - batch_start

        # Time the same DOIs as independent calls, one after another.
        loop_start = time.perf_counter()
        for doi in dois:
            self_references(doi)
        sync_duration = time.perf_counter() - loop_start

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )

        # The batched call must be faster or at least not slower.
        assert async_duration <= sync_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from typing import List | ||
|
||
import httpx | ||
|
||
# Regular expression for a DOI: the prefix "10.", 4-9 digits, a slash and a
# suffix. The suffix character class lists only uppercase letters, so the
# pattern must be used with re.IGNORECASE to match lowercase suffixes.
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
||
|
||
def check_overlap(n1: str, n2: str) -> bool:
    """
    Check whether the author name `n2` is covered by the author name `n1`.

    Both names are lowercased, stripped of periods and split on whitespace;
    single-character tokens (initials) are dropped. The names overlap when
    every remaining token of `n2` also occurs in `n1`. The check is therefore
    not symmetric: "J. Jumper" is covered by "John Jumper", but not vice versa.

    TODO: This can be made more robust

    Args:
        n1: first name
        n2: second name

    Returns:
        bool: Whether all name tokens of `n2` are contained in `n1`. Always
        False when `n2` has no usable tokens (empty or initials only).
    """
    # remove initials and check for name intersection
    s1 = {w for w in n1.lower().replace(".", "").split() if len(w) > 1}
    s2 = {w for w in n2.lower().replace(".", "").split() if len(w) > 1}
    # The empty set is a subset of every set; without this guard a blank or
    # initials-only name would match every author.
    if not s2:
        return False
    return s2 <= s1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters