From b40db9685b35e43ccdadde0d50bb4ba93638e6b4 Mon Sep 17 00:00:00 2001
From: Mike Bannister <mike.bannister@nrel.gov>
Date: Thu, 17 Aug 2023 09:00:10 -0600
Subject: [PATCH] Start converting 'scrape' to 'process' (#1)

* Start converting 'scrape' to 'process'

* Finish converting 'scrape' to 'process'

---------

Co-authored-by: Brian Mirletz <brian.mirletz@nrel.gov>
---
 .gitignore                                    | 104 ++++++++++++++++++
 README.md                                     |  12 +-
 .../debt_fraction_calc.py                     |   3 +-
 example_notebooks/Full work flow.ipynb        |  25 +++--
 .../Process ATB electricity technology.ipynb  |   1 -
 lcoe_calculator/README.md                     |  11 +-
 .../{full_scrape.py => process_all.py}        |  50 ++++-----
 lcoe_calculator/tech_processors.py            |   4 +-
 8 files changed, 158 insertions(+), 52 deletions(-)
 create mode 100644 .gitignore
 rename lcoe_calculator/{full_scrape.py => process_all.py} (73%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f6d2cd6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,104 @@
+# Custom list
+.DS_Store
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/README.md b/README.md
index c92a796..1bcce38 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 Python files and Jupyter notebooks for processing the Annual Technology Baseline (ATB) electricity data and determining LCOE and other metrics. All documentation and data for the ATB is available at the [ATB website](https://atb.nrel.gov).
 
 ## Installation and Requirements
-The pipeline requires [Python](https://www.python.org) 3.10 or newer. Dependancies can be installed using `pip`: 
+The pipeline requires [Python](https://www.python.org) 3.10 or newer. Dependencies can be installed using `pip`:
 
 ```
 $ pip install -r requirements.txt
@@ -30,27 +30,27 @@ is the path and filename to the ATB electricity data workbook `xlsx` file.
 Process all techs and export to a flat file named `flat_file.csv`:
 
 ```
-$ python -m lcoe_calculator.full_scrape --save-flat flat_file.csv {PATH-TO-DATA-WORKBOOK}
+$ python -m lcoe_calculator.process_all --save-flat flat_file.csv {PATH-TO-DATA-WORKBOOK}
 ```
 
 Process only land-based wind and export pivoted data and meta data:
 
 ```
-$ python -m lcoe_calculator.full_scrape --tech LandBasedWindProc \
+$ python -m lcoe_calculator.process_all --tech LandBasedWindProc \
 	--save-pivoted pivoted_file.csv --save-meta meta_file.csv {PATH-TO-DATA-WORKBOOK}
 ```
 
 Process only pumped storage hydropower and copy data to the clipboard so it may be pasted into a spreadsheet:
 
 ```
-$ python -m lcoe_calculator.full_scrape --tech PumpedStorageHydroProc \
+$ python -m lcoe_calculator.process_all --tech PumpedStorageHydroProc \
 	--clipboard {PATH-TO-DATA-WORKBOOK}
 ```
 
-Help for the scraper and the names of available technologies can be viewed by running:
+Help for the processor and the names of available technologies can be viewed by running:
 
 ```
-$ python -m lcoe_calculator.full_scrape --help
+$ python -m lcoe_calculator.process_all --help
 ```
 
 ## Debt Fraction Calculator
diff --git a/debt_fraction_calculator/debt_fraction_calc.py b/debt_fraction_calculator/debt_fraction_calc.py
index 5db7872..f476097 100644
--- a/debt_fraction_calculator/debt_fraction_calc.py
+++ b/debt_fraction_calculator/debt_fraction_calc.py
@@ -5,7 +5,7 @@
 # (see https://github.com/NREL/ATB-calc).
 #
 """
-Workflow to calculate debt fractions based on scraped data
+Workflow to calculate debt fractions based on ATB data
 
 Developed against PySAM 4.0.0
 """
@@ -13,7 +13,6 @@
 import pandas as pd
 import click
 
-
 import PySAM.Levpartflip as levpartflip
 
 from lcoe_calculator.extractor import Extractor
diff --git a/example_notebooks/Full work flow.ipynb b/example_notebooks/Full work flow.ipynb
index 1dcd7b8..a1836fc 100644
--- a/example_notebooks/Full work flow.ipynb	
+++ b/example_notebooks/Full work flow.ipynb	
@@ -31,7 +31,7 @@
     "from datetime import datetime as dt\n",
     "\n",
     "sys.path.insert(0, os.path.dirname(os.getcwd()))\n",
-    "from lcoe_calculator.full_scrape import FullScrape\n",
+    "from lcoe_calculator.process_all import ProcessAll\n",
     "from lcoe_calculator.tech_processors import (ALL_TECHS,\n",
     "    OffShoreWindProc, LandBasedWindProc, DistributedWindProc,\n",
     "    UtilityPvProc, CommPvProc, ResPvProc, UtilityPvPlusBatteryProc,\n",
@@ -79,8 +79,8 @@
     "# Or process a single technology\n",
     "techs = LandBasedWindProc\n",
     "\n",
-    "# Initiate the scraper with the workbook location and desired technologies\n",
-    "scraper = FullScrape(atb_electricity_workbook, techs)"
+    "# Initiate the processor with the workbook location and desired technologies\n",
+    "processor = ProcessAll(atb_electricity_workbook, techs)"
    ]
   },
   {
@@ -90,7 +90,10 @@
    "metadata": {},
    "source": [
     "## Run the pipeline\n",
-    "Now that the scraper knows where the data workbook is and which technologies were interested in we can kick it off. Depending on the number of requested technologies, this can take a couple minutes. Note that calculated LCOE and CAPEX is automatically compared to the values in the workbook. Not all technologies have LCOE and CAPEX."
+    "Now that the processor knows where the data workbook is and which technologies we are interested in, we\n",
+    "can kick it off. Depending on the number of requested technologies, this can take a couple of minutes.\n",
+    "Note that calculated LCOE and CAPEX are automatically compared to the values in the workbook. Not all\n",
+    "technologies have LCOE and CAPEX."
    ]
   },
   {
@@ -103,7 +106,7 @@
    "outputs": [],
    "source": [
     "start = dt.now()\n",
-    "scraper.scrape()\n",
+    "processor.process()\n",
     "print('Processing completed in ', dt.now() - start)"
    ]
   },
@@ -124,16 +127,16 @@
    "outputs": [],
    "source": [
     "# Save data to as a CSV\n",
-    "scraper.to_csv('atb_data.csv')\n",
+    "processor.to_csv('atb_data.csv')\n",
     "\n",
     "# Save flattened data to as a CSV\n",
-    "scraper.flat_to_csv('atb_data_flat.csv')\n",
+    "processor.flat_to_csv('atb_data_flat.csv')\n",
     "\n",
     "# Save meta data to as a CSV\n",
-    "scraper.meta_data_to_csv('atb_meta_data.csv')\n",
+    "processor.meta_data_to_csv('atb_meta_data.csv')\n",
     "\n",
     "# Copy data to the clipboard so it can be pasted in a spreadsheet \n",
-    "scraper.data.to_clipboard()"
+    "processor.data.to_clipboard()"
    ]
   },
   {
@@ -152,7 +155,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = scraper.data\n",
+    "data = processor.data\n",
     "\n",
     "# Show available parameters\n",
     "print('Available parameters')\n",
@@ -184,7 +187,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/example_notebooks/Process ATB electricity technology.ipynb b/example_notebooks/Process ATB electricity technology.ipynb
index 000695b..2e83d2a 100644
--- a/example_notebooks/Process ATB electricity technology.ipynb	
+++ b/example_notebooks/Process ATB electricity technology.ipynb	
@@ -31,7 +31,6 @@
     "from datetime import datetime as dt\n",
     "\n",
     "sys.path.insert(0, os.path.dirname(os.getcwd()))\n",
-    "from lcoe_calculator.full_scrape import FullScrape\n",
     "\n",
     "# Electricity technology processors\n",
     "from lcoe_calculator.tech_processors import (\n",
diff --git a/lcoe_calculator/README.md b/lcoe_calculator/README.md
index f148166..4553aa4 100644
--- a/lcoe_calculator/README.md
+++ b/lcoe_calculator/README.md
@@ -1,14 +1,15 @@
 # ATB Calculator
-This code scrapes the Excel ATB data workbook, calculates LCOE and CAPEX
-for all technologies as needed, and exports data in flat or flat + pivoted formats.
+This code extracts data from the ATB Excel workbook, then calculates LCOE and CAPEX for all
+technologies as needed, and exports data in flat or flat and pivoted formats.
 
-**Note:** You will likely have to give Python access to interact with Excel. A window will automatically ask for permission the first time this script is run.
+**Note:** You will likely have to give Python access to interact with Excel. A window will
+automatically ask for permission the first time this script is run.
 
 ## Files
 Files are listed in roughly descending order of importance and approachability.
 
- - `full_scrape.py` - Class that performs full scrape with built in command line interface. See the README in the root of this repo for CLI examples.
- - `tech_processors.py` - Classes to scrape and process individual technologies. Any new ATB technologies should be added to this file.
+ - `process_all.py` - Class that performs processing for all ATB technologies with a built-in command line interface. See the README in the root of this repo for CLI examples.
+ - `tech_processors.py` - Classes to process individual technologies. Any new ATB technologies should be added to this file.
  - `base_processor.py` - Base processor class that is subclassed to process individual technologies.
  - `config.py` - Constant definitions including the base year and scenario names
  - `extractor.py` - Code to pull values from the workbook
diff --git a/lcoe_calculator/full_scrape.py b/lcoe_calculator/process_all.py
similarity index 73%
rename from lcoe_calculator/full_scrape.py
rename to lcoe_calculator/process_all.py
index b3ac6be..47ef689 100644
--- a/lcoe_calculator/full_scrape.py
+++ b/lcoe_calculator/process_all.py
@@ -5,7 +5,7 @@
 # (see https://github.com/NREL/ATB-calc).
 #
 """
-Scrape ATB data workbook and calculate all metrics.
+Process all (or some) ATB technologies and calculate all metrics.
 """
 from typing import List, Dict, Type
 from datetime import datetime as dt
@@ -14,11 +14,11 @@
 
 from .tech_processors import ALL_TECHS
 from .base_processor import TechProcessor
-from .config import FINANCIAL_CASES, CRP_CHOICES, CrpChoiceType
+from .config import FINANCIAL_CASES, CRP_CHOICES
 
-class FullScrape:
+class ProcessAll:
     """
-    Scrape data workbook and calculate LCOE for techs, CRPs, and financial
+    Extract data from ATB workbook and calculate LCOE for techs, CRPs, and financial
     scenarios.
     """
     def __init__(self, data_workbook_fname: str,
@@ -30,16 +30,16 @@ def __init__(self, data_workbook_fname: str,
         if not isinstance(techs, list):
             techs = [techs]
 
-        self.data = pd.DataFrame()  # Flat data from scrape
-        self.meta = pd.DataFrame()  # Meta data from scrape
+        self.data = pd.DataFrame()  # Flat data
+        self.meta = pd.DataFrame()  # Meta data
 
         self._techs = techs
         self._fname = data_workbook_fname
 
-    def scrape(self, test_capex: bool = True, test_lcoe: bool = True):
-        """ Scrap all techs """
-        self.data = pd.DataFrame()  # Flat data from scrape
-        self.meta = pd.DataFrame()  # Meta data from scrape
+    def process(self, test_capex: bool = True, test_lcoe: bool = True):
+        """ Process all techs """
+        self.data = pd.DataFrame()
+        self.meta = pd.DataFrame()
 
         for i, Tech in enumerate(self._techs):
             print(f'##### Processing {Tech.tech_name} ({i+1}/{len(self._techs)}) #####')
@@ -72,7 +72,7 @@ def scrape(self, test_capex: bool = True, test_lcoe: bool = True):
     def data_flattened(self):
         """ Get flat data pivoted with each year as a row """
         if self.data is None:
-            raise ValueError('Please run scrape() first')
+            raise ValueError('Please run process() first')
 
         melted = pd.melt(self.data, id_vars=['Parameter', 'Case', 'CRPYears',
                                              'Technology', 'DisplayName', 'Scenario'])
@@ -81,20 +81,20 @@ def data_flattened(self):
     def to_csv(self, fname: str):
         """ Write data to CSV """
         if self.data is None:
-            raise ValueError('Please run scrape() first')
+            raise ValueError('Please run process() first')
 
         self.data.to_csv(fname)
 
     def flat_to_csv(self, fname: str):
         """ Write pivoted data to CSV """
         if self.data is None:
-            raise ValueError('Please run scrape() first')
+            raise ValueError('Please run process() first')
         self.data_flattened.to_csv(fname)
 
     def meta_data_to_csv(self, fname: str):
         """ Write meta data to CSV """
         if self.data is None:
-            raise ValueError('Please run scrape() first')
+            raise ValueError('Please run process() first')
 
         self.meta.to_csv(fname)
 
@@ -104,7 +104,7 @@ def meta_data_to_csv(self, fname: str):
 @click.command
 @click.argument('data_workbook_filename', type=click.Path(exists=True))
 @click.option('-t', '--tech', type=click.Choice(tech_names),
-              help="Name of tech to scrape. Scrape all techs if none are specified.")
+              help="Name of tech to process. Process all techs if none are specified.")
 @click.option('-m', '--save-meta', 'meta_file', type=click.Path(),
               help="Save meta data to CSV.")
 @click.option('-f', '--save-flat', 'flat_file', type=click.Path(),
@@ -113,36 +113,36 @@ def meta_data_to_csv(self, fname: str):
               help="Save data in pivoted format to CSV.")
 @click.option('-c', '--clipboard', is_flag=True, default=False,
               help="Copy data to system clipboard.")
-def run_scrape(data_workbook_filename: str, tech: str|None, meta_file: str|None, flat_file: str|None,
+def process(data_workbook_filename: str, tech: str|None, meta_file: str|None, flat_file: str|None,
                pivoted_file: str|None, clipboard: bool):
     """
-    CLI to scrape ATB data workbook and calculate metrics.
+    CLI to process ATB data workbook and calculate metrics.
     """
     tech_map: Dict[str, Type[TechProcessor]] = {tech.__name__: tech for tech in ALL_TECHS}
 
     techs = ALL_TECHS if tech is None else [tech_map[tech]]
 
     start_dt = dt.now()
-    scraper = FullScrape(data_workbook_filename, techs)
-    scraper.scrape()
-    click.echo(f'Scrape completed in {dt.now()-start_dt}.')
+    processor = ProcessAll(data_workbook_filename, techs)
+    processor.process()
+    click.echo(f'Processing completed in {dt.now()-start_dt}.')
 
     if meta_file:
         click.echo(f'Writing meta data to {meta_file}.')
-        scraper.meta_data_to_csv(meta_file)
+        processor.meta_data_to_csv(meta_file)
 
     if flat_file:
         click.echo(f'Writing flat data to {flat_file}.')
-        scraper.flat_to_csv(flat_file)
+        processor.flat_to_csv(flat_file)
 
     if pivoted_file:
         click.echo(f'Writing pivoted data to {pivoted_file}.')
-        scraper.to_csv(pivoted_file)
+        processor.to_csv(pivoted_file)
 
     if clipboard:
         click.echo('Data was copied to clipboard.')
-        scraper.data.to_clipboard()
+        processor.data.to_clipboard()
 
 
 if __name__ == '__main__':
-    run_scrape()  # pylint: disable=no-value-for-parameter
+    process()  # pylint: disable=no-value-for-parameter
diff --git a/lcoe_calculator/tech_processors.py b/lcoe_calculator/tech_processors.py
index dc72cd9..f523738 100644
--- a/lcoe_calculator/tech_processors.py
+++ b/lcoe_calculator/tech_processors.py
@@ -5,10 +5,10 @@
 # (see https://github.com/NREL/ATB-calc).
 #
 """
-Individual tech scrapers. See documentation in base_processor.py.
+Individual tech processors. See documentation in base_processor.py.
 """
-import pandas as pd
 from typing import List, Type
+import pandas as pd
 
 from .config import MARKET_FIN_CASE
 from .extractor import Extractor