Skip to content

Commit

Permalink
Simplify curated paper loading
Browse files Browse the repository at this point in the history
  • Loading branch information
bgyori committed Nov 3, 2024
1 parent f9a4179 commit aaf35c7
Showing 1 changed file with 22 additions and 41 deletions.
63 changes: 22 additions & 41 deletions src/bioregistry/analysis/paper_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,60 +65,40 @@ def load_bioregistry_json(file_path):
return pd.DataFrame(publications)


def load_curated_pmids(file_path):
    """Load the PMIDs of already-curated papers from a TSV file.

    The PMID column is looked up as ``pubmed`` first and ``pmid`` second, so
    this function accepts the same files as ``load_curated_papers``, which
    renames a ``pmid`` column to ``pubmed`` after reading.

    :param file_path: Path to the curated_papers.tsv file.
    :type file_path: str
    :return: List containing already curated PMIDs; an empty list if the
        file does not exist.
    :rtype: list
    :raises KeyError: If the file has neither a ``pubmed`` nor a ``pmid`` column.
    """
    # Keep the try body minimal: only the read can raise FileNotFoundError.
    try:
        curated_papers_df = pd.read_csv(file_path, sep="\t")
    except FileNotFoundError:
        click.echo(f"Could not find file {file_path}.")
        return []

    # Prefer the already-normalized name; fall back to the raw TSV header.
    pmid_column = "pubmed" if "pubmed" in curated_papers_df.columns else "pmid"
    return curated_papers_df[pmid_column].tolist()


def load_curated_papers(file_path):
def load_curated_papers(file_path=CURATED_PAPERS_PATH):

Check warning on line 68 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L68

Added line #L68 was not covered by tests
"""Load curated papers data from TSV file, and fetch titles and abstracts for PMIDs.
:param file_path: Path to the curated_papers.tsv file.
:type file_path: str
:return: DataFrame containing curated publication details.
:rtype: pd.DataFrame
"""
try:
curated_df = pd.read_csv(file_path, sep="\t")
curated_df = curated_df.rename(columns={"pmid": "pubmed", "relevant": "label"})
curated_df["title"] = ""
curated_df["abstract"] = ""

pmids_to_fetch = curated_df["pubmed"].tolist()
fetched_metadata = {}
for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]:
fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True))

for index, row in curated_df.iterrows():
if row["pubmed"] in fetched_metadata:
curated_df.at[index, "title"] = fetched_metadata[row["pubmed"]].get("title", "")
curated_df.at[index, "abstract"] = fetched_metadata[row["pubmed"]].get(
"abstract", ""
)
curated_df = pd.read_csv(file_path, sep="\t")
curated_df = curated_df.rename(columns={"pmid": "pubmed", "relevant": "label"})
curated_df["title"] = ""
curated_df["abstract"] = ""

Check warning on line 79 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L76-L79

Added lines #L76 - L79 were not covered by tests

click.echo(f"Got {len(curated_df)} curated publications from the curated_papers.tsv file")
return curated_df
except FileNotFoundError:
click.echo(f"Could not find file {file_path}.")
return pd.DataFrame()
pmids_to_fetch = curated_df["pubmed"].tolist()
fetched_metadata = {}

Check warning on line 82 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L81-L82

Added lines #L81 - L82 were not covered by tests
for chunk in [pmids_to_fetch[i : i + 200] for i in range(0, len(pmids_to_fetch), 200)]:
fetched_metadata.update(pubmed_client.get_metadata_for_ids(chunk, get_abstracts=True))

Check warning on line 84 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L84

Added line #L84 was not covered by tests

for index, row in curated_df.iterrows():
if row["pubmed"] in fetched_metadata:
curated_df.at[index, "title"] = fetched_metadata[row["pubmed"]].get("title", "")
curated_df.at[index, "abstract"] = fetched_metadata[row["pubmed"]].get(

Check warning on line 89 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L88-L89

Added lines #L88 - L89 were not covered by tests
"abstract", ""
)

click.echo(f"Got {len(curated_df)} curated publications from the curated_papers.tsv file")
return curated_df

Check warning on line 94 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L93-L94

Added lines #L93 - L94 were not covered by tests


def fetch_pubmed_papers(curated_pmids):

Check warning on line 97 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L97

Added line #L97 was not covered by tests
"""Fetch PubMed papers from the last 30 days using specific search terms, excluding curated papers.
:param curated_pmids: List containing already curated PMIDs
:type curated_pmids: list
:type curated_pmids: Iterable
:return: DataFrame containing PubMed paper details.
:rtype: pd.DataFrame
"""
Expand Down Expand Up @@ -410,7 +390,8 @@ def main(bioregistry_file, start_date, end_date):
click.echo(f"Writing feature (word) importances to {importance_path}")
importances_df.to_csv(importance_path, sep="\t", index=False)

curated_pmids = load_curated_pmids(CURATED_PAPERS_PATH)
# These have already been curated and will therefore be filtered out
curated_pmids = set(curated_papers_df["pubmed"])

Check warning on line 394 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L394

Added line #L394 was not covered by tests

new_pub_df = fetch_pubmed_papers(curated_pmids)

Check warning on line 396 in src/bioregistry/analysis/paper_ranking.py

View check run for this annotation

Codecov / codecov/patch

src/bioregistry/analysis/paper_ranking.py#L396

Added line #L396 was not covered by tests
if not new_pub_df.empty:
Expand Down

0 comments on commit aaf35c7

Please sign in to comment.