Skip to content

Commit

Permalink
Implement requested changes
Browse files Browse the repository at this point in the history
  • Loading branch information
panos-span committed Jun 5, 2024
1 parent 89fcf1f commit e4aa67c
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 40 deletions.
4 changes: 1 addition & 3 deletions src/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@ verify_ssl = true
name = "pypi"

[packages]
pybliometrics = "*"
apsw = "*"
importlib-metadata = "*"
pyahocorasick = "*"

[dev-packages]
tqdm = "*"
pandas = "*"
pybliometrics = "*"
black = "*"
hatch = "*"
pylint = "*"
Expand Down
46 changes: 43 additions & 3 deletions src/alexandria3k/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,22 +101,39 @@ def get_data_source_instance(args):

def download(args):
    """Download data using the specified data source.

    Validates the argument combination, resolves the data source, and
    delegates to its ``download`` method.

    :param args: Parsed command-line arguments; must provide
        ``validate_args``, ``data_name``, ``database``, ``sql_query``,
        ``output_path`` and ``extra_args``.
    :raises Alexandria3kError: If the selected data source does not
        implement downloading.
    """
    args.validate_args(args)
    data_source_instance = get_data_source_instance(args)
    # Not every data source implements downloading; fail early with a
    # clear error instead of an AttributeError.
    if not hasattr(data_source_instance, "download"):
        raise Alexandria3kError(
            f"The data source {args.data_name} does not support downloading"
        )
    # Removed a stale duplicate call with the old two-argument signature
    # that ran before the hasattr guard and would download twice.
    data_source_instance.download(
        args.database, args.sql_query, args.output_path, *args.extra_args
    )
    perf.log(
        f"Data downloaded and saved to {data_source_instance.output_path}"
    )


def validate_args(args):
    """Validate that both database and sql_query are either both provided or both omitted."""
    has_database = bool(args.database)
    has_query = bool(args.sql_query)
    # Exactly one of the two being set is the invalid combination.
    if has_database != has_query:
        raise argparse.ArgumentTypeError(
            "Both --database and --sql-query must be provided together or not at all."
        )
    return args


def add_subcommand_download(subparsers):
"""Add the arguments of the download subcommand."""
parser = subparsers.add_parser(
"download", help="Download data using the specified data source."
)
parser.set_defaults(func=download)
parser.add_argument(
"database", help="File path of the database to use for retrieving data"
"database", help="File path of the database to use", nargs="?"
)
parser.add_argument(
"data_name",
Expand All @@ -129,12 +146,35 @@ def add_subcommand_download(subparsers):
type=str,
help="SQL query to retrieve the data for downloading",
)
parser.add_argument(
"-o",
"--output-path",
type=str,
nargs="?",
help="File path to save the downloaded data",
)
parser.add_argument(
"extra_args",
nargs="*",
help="Additional arguments for the data source (e.g. URL, key, file path)",
)

# Add a custom validation function to the parser
parser.set_defaults(validate_args=validate_args)


def populate(args):
"""Populate the specified database from the specified data source."""

data_source_instance = get_data_source_instance(args)
if hasattr(data_source_instance, "download"):
        # Check if the output_path attribute is not None
if data_source_instance.output_path is None:
raise Alexandria3kError(
"Output path is not set. Please ensure the download"
"method has been called and output_path is set."
)

if args.row_selection_file:
with open(args.row_selection_file, encoding="utf-8") as file:
args.row_selection = file.read()
Expand Down
92 changes: 58 additions & 34 deletions src/alexandria3k/data_sources/issn_subject_codes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#
# Alexandria3k Crossref bibliographic metadata processing
# Copyright (C) 2022-2023 Diomidis Spinellis
# Copyright (C) 2024 Panagiotis-Alexios Spanakis
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This program is free software: you can redistribute it and/or modify
Expand All @@ -27,15 +27,14 @@
import os
import sqlite3
import tempfile
import pandas as pd
from tqdm import tqdm
import pybliometrics
from pybliometrics.scopus import SerialSearch
from alexandria3k.csv_source import CsvCursor, VTSource
from alexandria3k.data_source import DataSource
from alexandria3k.db_schema import ColumnMeta, TableMeta
from alexandria3k import perf
from alexandria3k import perf, debug
from alexandria3k.common import ensure_table_exists, get_string_resource
from alexandria3k.data_source import PROGRESS_BAR_LENGTH

issn_subject_codes_table = TableMeta(
"issn_subject_codes",
Expand Down Expand Up @@ -95,17 +94,17 @@ def get_config_path(config_path):
raise FileNotFoundError(
f"Configuration file not found at {config_path}"
)
print(f"Using config file at {config_path}")
debug.log("config-file", f"Using config file at {config_path}")
return config_path

# Set the configuration file path
config_path = get_config_path(config_path)
os.environ["PYBLIOMETRICS_CONFIG_FILE"] = config_path
# Check if the environment variable is set or not
if "PYBLIOMETRICS_CONFIG_FILE" not in os.environ:
# Set the configuration file path
config_path = get_config_path(config_path)
os.environ["PYBLIOMETRICS_CONFIG_FILE"] = config_path

self.sample = sample
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
self.output_path = tmp_file.name

self.output_path = None
data_source = self.output_path if data_source is None else data_source
super().__init__(
VTSource(issn_subject_codes_table, data_source, sample),
Expand All @@ -119,38 +118,63 @@ def execute_sql_query(self, cursor, script):
issns = [row[0] for row in cursor.fetchall()]
return issns

def debug_progress_bar(self, current_progress, total_length):
    """Print a progress bar"""
    # Percentage completed and the number of filled bar cells.
    percent = current_progress / total_length * 100
    filled = int(PROGRESS_BAR_LENGTH * current_progress / total_length)
    bar = "#" * filled + "-" * (PROGRESS_BAR_LENGTH - filled)
    debug.log(
        "progress_bar",
        f"\r[{bar}] {percent:.2f}% | "
        f"Processed {current_progress} out of {total_length} ISSNs",
        end="",
    )

def fetch_subject_codes(self, writer, issns):
    """Fetch the subject codes for the specified ISSNs.

    For each ISSN, query the Scopus SerialSearch API and write one
    ``[issn, code]`` row per unique ASJC subject code found.

    :param writer: A csv.writer-like object supporting ``writerow``.
    :param issns: Iterable of ISSN strings to look up.
    """
    # Removed the stale pandas-based implementation that was left
    # interleaved with this dict-based one (it referenced `pd`, whose
    # import was dropped, and re-assigned `results`).
    total_issns = len(issns)
    for index, issn in enumerate(issns):
        self.debug_progress_bar(index + 1, total_issns)
        query = {"issn": issn}
        try:
            serial_search = SerialSearch(query=query, view="STANDARD")
            results = list(serial_search.results)

            # Collect the unique ASJC codes across all results for
            # this ISSN; the API returns them as a ";"-separated string.
            subject_area_codes_set = set()
            for result in results:
                if (
                    "subject_area_codes" in result
                    and result["subject_area_codes"]
                ):
                    subject_area_codes_set.update(
                        result["subject_area_codes"].split(";")
                    )

            for code in subject_area_codes_set:
                writer.writerow([issn, code])

        except (KeyError, ValueError) as e:
            # Best-effort: report and continue with the next ISSN.
            print(f"Error processing ISSN {issn}: {e}")

def download(self, database, sql_query=None):
def download(self, database, sql_query=None, output_path=None):
"""
Create a CSV file with ISSNs and their corresponding ASJC subject codes from API calls.
"""
if output_path is None:
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
self.output_path = tmp_file.name
else:
with tempfile.NamedTemporaryFile(
delete=False, dir=os.path.dirname(output_path)
) as tmp_file:
self.output_path = tmp_file.name

try:
pybliometrics.scopus.init()
except RuntimeError as e:
Expand Down Expand Up @@ -193,5 +217,5 @@ def populate(self, database_path, columns=None, condition=None):

# Proceed with the standard population process
super().populate(database_path, columns, condition)

os.remove(self.output_path)
if self.output_path:
os.remove(self.output_path)

0 comments on commit e4aa67c

Please sign in to comment.