Skip to content

Commit

Permalink
Adding support for self-referencing (#59)
Browse files Browse the repository at this point in the history
* refactor: change default logging level

* feat: tested version of self_references

* chore

* feat: Support and test asynchronous calls

* chore
  • Loading branch information
jannisborn authored Nov 24, 2024
1 parent 0afbada commit 7670c44
Show file tree
Hide file tree
Showing 15 changed files with 255 additions and 14 deletions.
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .load_dumps import QUERY_FN_DICT
from .utils import get_filename_from_query

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logger = logging.getLogger(__name__)

# Set urllib logging depth
Expand Down
2 changes: 2 additions & 0 deletions paperscraper/citations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from ..scholar import get_citations_from_title
from .core import self_references, self_references_paper
119 changes: 119 additions & 0 deletions paperscraper/citations/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import asyncio
import logging
import re
import sys
from typing import Dict, Iterable, Union

import httpx

from ..utils import optional_async
from .utils import check_overlap, doi_pattern

# Request stdout logging with an INFO threshold on the root logger.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Raise the "httpx" logger's threshold so that only WARNING and above are emitted
# by it.
logging.getLogger("httpx").setLevel(logging.WARNING)


@optional_async
async def self_references(
    inputs: Union[str, Iterable[str]],
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Dict[str, Union[float, int]]]:
    """
    Analyze self-references for a DOI or a list of DOIs.

    All inputs are validated first; the per-paper lookups are then run
    concurrently with ``asyncio.gather``.

    Args:
        inputs: A single DOI or an iterable of DOIs.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary where the keys are the input strings and the values are
        dictionaries mapping authors to their self-citations. Inputs that contain
        more than one DOI are skipped (a warning is logged) and do not appear
        in the result.

    Raises:
        NotImplementedError: If an input does not contain a DOI.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    # Validate everything *before* creating any coroutine. Creating coroutines
    # while still validating would leave them un-awaited (RuntimeWarning) if a
    # later input raises.
    targets = []
    for sample in inputs:
        dois = re.findall(doi_pattern, sample, re.IGNORECASE)
        if not dois:
            # TODO: Check that it is a proper name or an ORCID ID
            raise NotImplementedError(
                "Analyzing self-references of whole authors is not yet implemented."
            )
        if len(dois) > 1:
            # Ambiguous input: keep the previous behaviour of not analysing it,
            # but tell the caller instead of dropping it silently.
            logger.warning(
                "Skipping input containing %d DOIs (expected exactly one): %s",
                len(dois),
                sample,
            )
            continue
        targets.append((sample, dois[0]))

    # Inside a running loop `self_references_paper` hands back a coroutine, so
    # all lookups can be awaited together.
    completed = await asyncio.gather(
        *(
            self_references_paper(doi, verbose=verbose, relative=relative)
            for _, doi in targets
        )
    )

    results: Dict[str, Dict[str, Union[float, int]]] = {}
    for (sample, _), paper_result in zip(targets, completed):
        results[sample] = paper_result
    return results


@optional_async
async def self_references_paper(
    doi: str,
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Union[float, int]]:
    """
    Analyze self-references for a single DOI.

    Fetches the paper's title, authors and the authors of its references from
    the Semantic Scholar Graph API and counts, per author of the paper, how
    many references have an author whose name overlaps (see `check_overlap`).

    Args:
        doi: The DOI to analyze.
        relative: If True, returns self-citations as percentages; otherwise, as raw counts.
            Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary mapping authors to their self-citations.

    Raises:
        ValueError: If no references are found for the given DOI.
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            params={"fields": "title,authors,references.authors"},
        )
        response.raise_for_status()
        paper = response.json()

    # `.get` so that a missing key is reported as the documented ValueError
    # rather than a KeyError.
    references = paper.get("references")
    if not references:
        raise ValueError("Could not find citations from Semantic Scholar")
    total = len(references)

    authors: Dict[str, int] = {a["name"]: 0 for a in paper["authors"]}

    for ref in references:
        # NOTE(review): defensive - a reference without an author list or with
        # a null/empty author name is ignored instead of crashing the name
        # comparison; confirm against the API schema.
        ref_authors = {
            a["name"] for a in (ref.get("authors") or []) if a.get("name")
        }
        for author in authors:
            if any(check_overlap(author, ra) for ra in ref_authors):
                authors[author] += 1

    if verbose:
        logger.info(f"Self references in \"{paper['title']}\"")
        logger.info(f" N = {total}")
        for author, self_cites in authors.items():
            logger.info(f" {author}: {100*(self_cites/total):.2f}% self-references")

    if relative:
        # Build a new mapping: the counting dict is typed as str -> int and
        # should not be overwritten with floats while being iterated.
        return {
            author: round(100 * self_cites / total, 2)
            for author, self_cites in authors.items()
        }

    return authors
Empty file.
84 changes: 84 additions & 0 deletions paperscraper/citations/tests/test_self_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import asyncio
import logging
import time

import pytest

from paperscraper.citations import self_references

logging.disable(logging.INFO)


class TestSelfReferences:
    """Tests for `self_references` (single DOI, DOI list, error path, timing)."""

    @pytest.fixture
    def dois(self):
        """Three DOIs used as inputs throughout the tests."""
        return [
            "10.1038/s43586-024-00334-2",
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    @staticmethod
    def _check_entries(result, relative):
        """Assert key/value types and value ranges of a `self_references` result."""
        for doi, self_cite_dict in result.items():
            assert isinstance(doi, str)
            assert isinstance(self_cite_dict, dict)
            for author, self_cites in self_cite_dict.items():
                assert isinstance(author, str)
                if relative:
                    # Percentages are floats within [0, 100]
                    assert isinstance(self_cites, float)
                    assert 0 <= self_cites <= 100
                else:
                    # Raw counts are non-negative integers
                    assert isinstance(self_cites, int)
                    assert self_cites >= 0

    def test_single_doi(self, dois):
        for relative in (True, False):
            result = self_references(dois[0], relative=relative)
            assert isinstance(result, dict)
            assert len(result) > 0
            self._check_entries(result, relative)

    def test_multiple_dois(self, dois):
        batch = dois[1:]
        for relative in (True, False):
            result = self_references(batch, relative=relative)
            assert isinstance(result, dict)
            assert len(result) == len(batch)
            self._check_entries(result, relative)

    def test_not_implemented_error(self):
        # A plain author name contains no DOI
        with pytest.raises(NotImplementedError):
            self_references("John Jumper")

    def test_compare_async_and_sync_performance(self, dois):
        """
        Compares the execution time of asynchronous and synchronous `self_references`
        for a list of DOIs.
        """
        # One batched call: the per-DOI lookups are gathered concurrently
        batch_start = time.perf_counter()
        self_references(dois)
        async_duration = time.perf_counter() - batch_start

        # Measure synchronous execution time (three independent calls)
        sequential_start = time.perf_counter()
        for doi in dois:
            self_references(doi)
        sync_duration = time.perf_counter() - sequential_start

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )

        # Assert that async execution (batch) is faster or at least not slower
        assert async_duration <= sync_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )
23 changes: 23 additions & 0 deletions paperscraper/citations/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import List

import httpx

# Regular expression for a DOI: the literal "10.", 4-9 digits, a slash, then one
# or more characters from the listed suffix set, delimited by word boundaries.
# The class lists upper-case letters only, so lower-case DOIs match only when the
# pattern is applied with re.IGNORECASE.
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"


def check_overlap(n1: str, n2: str) -> bool:
    """
    Check whether the name `n2` is covered by the name `n1`.

    Both names are lower-cased, dots are removed and single-character tokens
    (initials) are dropped. The names match when every remaining token of `n2`
    also occurs in `n1`. The check is therefore asymmetric:
    ``check_overlap("John Smith", "Smith")`` is True while
    ``check_overlap("Smith", "John Smith")`` is False.

    TODO: This can be made more robust

    Args:
        n1: first name (the reference name to compare against)
        n2: second name (the candidate name)

    Returns:
        bool: Whether all non-initial tokens of `n2` occur in `n1`. False if
            `n2` has no such tokens (empty string or initials only).
    """
    # remove initials and check for name intersection
    s1 = {w for w in n1.lower().replace(".", "").split() if len(w) > 1}
    s2 = {w for w in n2.lower().replace(".", "").split() if len(w) > 1}
    # An empty token set is a subset of everything; without this guard an
    # empty or initials-only name would match every author.
    if not s2:
        return False
    return s2 <= s1
2 changes: 1 addition & 1 deletion paperscraper/get_dumps/chemrxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .utils.chemrxiv import ChemrxivAPI, download_full, parse_dump

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

today = datetime.today().strftime("%Y-%m-%d")
Expand Down
2 changes: 1 addition & 1 deletion paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import requests

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

now_datetime = datetime.now()
Expand Down
4 changes: 1 addition & 3 deletions paperscraper/get_dumps/utils/chemrxiv/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from .chemrxiv_api import ChemrxivAPI

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

today = datetime.today().strftime("%Y-%m-%d")
Expand Down Expand Up @@ -90,7 +90,6 @@ def parse_dump(source_path: str, target_path: str) -> None:
dump = []
# Read source dump
for file_name in tqdm(os.listdir(source_path)):

if not file_name.endswith(".json"):
continue
filepath = os.path.join(source_path, file_name)
Expand Down Expand Up @@ -131,7 +130,6 @@ def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:

os.makedirs(save_dir, exist_ok=True)
for preprint in tqdm(api.all_preprints()):

path = os.path.join(save_dir, f"{preprint['item']['id']}.json")
if os.path.exists(path):
continue
Expand Down
2 changes: 1 addition & 1 deletion paperscraper/load_dumps.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .pubmed import get_and_dump_pubmed_papers
from .xrxiv.xrxiv_query import XRXivQuery

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up the query dictionary
Expand Down
2 changes: 1 addition & 1 deletion paperscraper/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from .utils import load_jsonl

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

ABSTRACT_ATTRIBUTE = {
Expand Down
3 changes: 1 addition & 2 deletions paperscraper/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -75,7 +75,6 @@ def aggregate_paper(
# At least one synonym per keyword needs to be in either title or
# abstract.
if filtering and filter_keys != list():

# Filter out papers with undesired terms
unwanted = False
for unwanted_key in unwanted_keys:
Expand Down
3 changes: 1 addition & 2 deletions paperscraper/scholar/scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from ..utils import dump_papers

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -49,7 +49,6 @@ def get_scholar_papers(

processed = []
for paper in matches:

# Extracts title, author, year, journal, abstract
entry = {
scholar_field_mapper.get(key, key): process_fields.get(
Expand Down
19 changes: 18 additions & 1 deletion paperscraper/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import asyncio
import json
import logging
import sys
from functools import wraps
from typing import Dict, List

import pandas as pd

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -68,3 +70,18 @@ def load_jsonl(filepath: str) -> List[Dict[str, str]]:
with open(filepath, "r") as f:
data = [json.loads(line) for line in f.readlines()]
return data


def optional_async(func):
    """
    Make a coroutine function callable from both sync and async code.

    Args:
        func: The coroutine function to wrap.

    Returns:
        A wrapper that, when an event loop is already running in the current
        thread, returns the coroutine created by `func` (the caller must
        await it), and otherwise runs the coroutine to completion with
        `asyncio.run` and returns its result.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Only the loop probe lives in the `try`: a RuntimeError raised by
        # `func` itself must not be mistaken for "no running loop".
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No active event loop: run synchronously using asyncio.run
            return asyncio.run(func(*args, **kwargs))
        # Async context: hand the coroutine back for the caller to await
        return func(*args, **kwargs)

    return wrapper
2 changes: 1 addition & 1 deletion paperscraper/xrxiv/xrxiv_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


Expand Down

0 comments on commit 7670c44

Please sign in to comment.