Skip to content

Commit

Permalink
Implement requested changes
Browse files Browse the repository at this point in the history
  • Loading branch information
panos-span committed Jun 5, 2024
1 parent 89fcf1f commit e4aa67c
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 40 deletions.
4 changes: 1 addition & 3 deletions src/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@ verify_ssl = true
name = "pypi"

[packages]
pybliometrics = "*"
apsw = "*"
importlib-metadata = "*"
pyahocorasick = "*"

[dev-packages]
tqdm = "*"
pandas = "*"
pybliometrics = "*"
black = "*"
hatch = "*"
pylint = "*"
Expand Down
46 changes: 43 additions & 3 deletions src/alexandria3k/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,22 +101,39 @@ def get_data_source_instance(args):

def download(args):
    """Download data using the specified data source.

    Validates the argument combination, resolves the data source, and
    delegates to its ``download`` method.

    :param args: Parsed command-line arguments; must provide
        ``validate_args``, ``data_name``, ``database``, ``sql_query``,
        ``output_path`` and ``extra_args``.
    :raises Alexandria3kError: If the selected data source does not
        implement downloading.
    """
    args.validate_args(args)
    data_source_instance = get_data_source_instance(args)
    # Not every data source implements downloading; fail early with a
    # clear error instead of an AttributeError.
    if not hasattr(data_source_instance, "download"):
        raise Alexandria3kError(
            f"The data source {args.data_name} does not support downloading"
        )
    # Removed a stale duplicate call with the old two-argument signature
    # that ran before the hasattr guard and would download twice.
    data_source_instance.download(
        args.database, args.sql_query, args.output_path, *args.extra_args
    )
    perf.log(
        f"Data downloaded and saved to {data_source_instance.output_path}"
    )


def validate_args(args):
    """Validate that both database and sql_query are either both provided or both omitted."""
    has_database = bool(args.database)
    has_query = bool(args.sql_query)
    # Exactly one of the two being set is the invalid combination.
    if has_database != has_query:
        raise argparse.ArgumentTypeError(
            "Both --database and --sql-query must be provided together or not at all."
        )
    return args


def add_subcommand_download(subparsers):
"""Add the arguments of the download subcommand."""
parser = subparsers.add_parser(
"download", help="Download data using the specified data source."
)
parser.set_defaults(func=download)
parser.add_argument(
"database", help="File path of the database to use for retrieving data"
"database", help="File path of the database to use", nargs="?"
)
parser.add_argument(
"data_name",
Expand All @@ -129,12 +146,35 @@ def add_subcommand_download(subparsers):
type=str,
help="SQL query to retrieve the data for downloading",
)
parser.add_argument(
"-o",
"--output-path",
type=str,
nargs="?",
help="File path to save the downloaded data",
)
parser.add_argument(
"extra_args",
nargs="*",
help="Additional arguments for the data source (e.g. URL, key, file path)",
)

# Add a custom validation function to the parser
parser.set_defaults(validate_args=validate_args)


def populate(args):
"""Populate the specified database from the specified data source."""

data_source_instance = get_data_source_instance(args)
if hasattr(data_source_instance, "download"):
        # Check if the output_path attribute is not None
if data_source_instance.output_path is None:
raise Alexandria3kError(
"Output path is not set. Please ensure the download"
"method has been called and output_path is set."
)

if args.row_selection_file:
with open(args.row_selection_file, encoding="utf-8") as file:
args.row_selection = file.read()
Expand Down
92 changes: 58 additions & 34 deletions src/alexandria3k/data_sources/issn_subject_codes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#
# Alexandria3k Crossref bibliographic metadata processing
# Copyright (C) 2022-2023 Diomidis Spinellis
# Copyright (C) 2024 Panagiotis-Alexios Spanakis
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
Expand All @@ -27,15 +27,14 @@
import os
import sqlite3
import tempfile
import pandas as pd
from tqdm import tqdm
import pybliometrics
from pybliometrics.scopus import SerialSearch
from alexandria3k.csv_source import CsvCursor, VTSource
from alexandria3k.data_source import DataSource
from alexandria3k.db_schema import ColumnMeta, TableMeta
from alexandria3k import perf
from alexandria3k import perf, debug
from alexandria3k.common import ensure_table_exists, get_string_resource
from alexandria3k.data_source import PROGRESS_BAR_LENGTH

issn_subject_codes_table = TableMeta(
"issn_subject_codes",
Expand Down Expand Up @@ -95,17 +94,17 @@ def get_config_path(config_path):
raise FileNotFoundError(
f"Configuration file not found at {config_path}"
)
print(f"Using config file at {config_path}")
debug.log("config-file", f"Using config file at {config_path}")
return config_path

# Set the configuration file path
config_path = get_config_path(config_path)
os.environ["PYBLIOMETRICS_CONFIG_FILE"] = config_path
# Check if the environment variable is set or not
if "PYBLIOMETRICS_CONFIG_FILE" not in os.environ:
# Set the configuration file path
config_path = get_config_path(config_path)
os.environ["PYBLIOMETRICS_CONFIG_FILE"] = config_path

self.sample = sample
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
self.output_path = tmp_file.name

self.output_path = None
data_source = self.output_path if data_source is None else data_source
super().__init__(
VTSource(issn_subject_codes_table, data_source, sample),
Expand All @@ -119,38 +118,63 @@ def execute_sql_query(self, cursor, script):
issns = [row[0] for row in cursor.fetchall()]
return issns

def debug_progress_bar(self, current_progress, total_length):
    """Print a progress bar"""
    # Percentage completed and the number of filled bar cells.
    percent = current_progress / total_length * 100
    filled = int(PROGRESS_BAR_LENGTH * current_progress / total_length)
    bar = "#" * filled + "-" * (PROGRESS_BAR_LENGTH - filled)
    debug.log(
        "progress_bar",
        f"\r[{bar}] {percent:.2f}% | "
        f"Processed {current_progress} out of {total_length} ISSNs",
        end="",
    )

def fetch_subject_codes(self, writer, issns):
    """Fetch the subject codes for the specified ISSNs.

    For each ISSN, query the Scopus SerialSearch API and write one
    ``[issn, code]`` row per unique ASJC subject code found.

    :param writer: A csv.writer-like object supporting ``writerow``.
    :param issns: Iterable of ISSN strings to look up.
    """
    # Removed the stale pandas-based implementation that was left
    # interleaved with this dict-based one (it referenced `pd`, whose
    # import was dropped, and re-assigned `results`).
    total_issns = len(issns)
    for index, issn in enumerate(issns):
        self.debug_progress_bar(index + 1, total_issns)
        query = {"issn": issn}
        try:
            serial_search = SerialSearch(query=query, view="STANDARD")
            results = list(serial_search.results)

            # Collect the unique ASJC codes across all results for
            # this ISSN; the API returns them as a ";"-separated string.
            subject_area_codes_set = set()
            for result in results:
                if (
                    "subject_area_codes" in result
                    and result["subject_area_codes"]
                ):
                    subject_area_codes_set.update(
                        result["subject_area_codes"].split(";")
                    )

            for code in subject_area_codes_set:
                writer.writerow([issn, code])

        except (KeyError, ValueError) as e:
            # Best-effort: report and continue with the next ISSN.
            print(f"Error processing ISSN {issn}: {e}")

def download(self, database, sql_query=None):
def download(self, database, sql_query=None, output_path=None):
"""
Create a CSV file with ISSNs and their corresponding ASJC subject codes from API calls.
"""
if output_path is None:
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
self.output_path = tmp_file.name
else:
with tempfile.NamedTemporaryFile(
delete=False, dir=os.path.dirname(output_path)
) as tmp_file:
self.output_path = tmp_file.name

try:
pybliometrics.scopus.init()
except RuntimeError as e:
Expand Down Expand Up @@ -193,5 +217,5 @@ def populate(self, database_path, columns=None, condition=None):

# Proceed with the standard population process
super().populate(database_path, columns, condition)

os.remove(self.output_path)
if self.output_path:
os.remove(self.output_path)

0 comments on commit e4aa67c

Please sign in to comment.