Add unittests for input validations

- fuels - auxiliary/check_dtypes
blue-marble · Aug 7, 2019 · b70e787 · b70e787
1 parent 97d2f57
commit b70e787
Show file tree

Hide file tree

Showing 4 changed files with 246 additions and 23 deletions.
diff --git a/gridpath/auxiliary/auxiliary.py b/gridpath/auxiliary/auxiliary.py
@@ -335,8 +335,6 @@ def check_dtypes(df, expected_dtypes):
     :return: List of error messages for each column with invalid datatypes.
         Error message specifies the column and the expected data type.
         List of columns with erroneous data types.
-
-    TODO: add example
     """
 
     result = []

diff --git a/gridpath/project/fuels.py b/gridpath/project/fuels.py
@@ -190,6 +190,43 @@ def validate_inputs(subscenarios, subproblem, stage, conn):
         )
 
     # Check that fuels specified for projects exist in fuels table
+    validation_errors = validate_fuel_projects(prj_df, fuels_df)
+    for error in validation_errors:
+        validation_results.append(
+            (subscenarios.SCENARIO_ID,
+             __name__,
+             "PROJECT_OPERATIONAL_CHARS",
+             "inputs_project_operational_chars",
+             "Non existent fuel",
+             error)
+        )
+
+    # Check that fuel prices exist for the period and month
+    validation_errors = validate_fuel_prices(fuels_df, fuel_prices_df,
+                                             periods_months)
+    for error in validation_errors:
+        validation_results.append(
+            (subscenarios.SCENARIO_ID,
+             __name__,
+             "PROJECT_FUEL_PRICES",
+             "inputs_project_fuel_prices",
+             "Missing fuel price",
+             error
+             )
+        )
+
+    # Write all input validation errors to database
+    write_validation_to_database(validation_results, conn)
+
+
+def validate_fuel_projects(prj_df, fuels_df):
+    """
+    Check that fuels specified for projects exist in fuels table
+    :param prj_df:
+    :param fuels_df:
+    :return:
+    """
+    results = []
     fuel_mask = pd.notna(prj_df["fuel"])
     existing_fuel_mask = prj_df["fuel"].isin(fuels_df["fuel"])
     invalids = fuel_mask & ~existing_fuel_mask
@@ -198,35 +235,33 @@ def validate_inputs(subscenarios, subproblem, stage, conn):
         bad_fuels = prj_df["fuel"][invalids].values
         print_bad_projects = ", ".join(bad_projects)
         print_bad_fuels = ", ".join(bad_fuels)
-        validation_results.append(
-            (subscenarios.SCENARIO_ID,
-             __name__,
-             "PROJECT_OPERATIONAL_CHARS",
-             "inputs_project_operational_chars",
-             "Non existent fuel",
-             "Project(s) '{}': Specified fuel(s) '{}' do(es) not exist"
-             .format(print_bad_projects, print_bad_fuels)
-             )
+        results.append(
+            "Project(s) '{}': Specified fuel(s) '{}' do(es) not exist"
+            .format(print_bad_projects, print_bad_fuels)
         )
 
-    # Check that fuel prices exist for the period and month
+    return results
+
+
+def validate_fuel_prices(fuels_df, fuel_prices_df, periods_months):
+    """
+    Check that fuel prices exist for the period and month
+    :param fuels_df:
+    :param fuel_prices_df:
+    :param periods_months:
+    :return:
+    """
+    results = []
     for f in fuels_df["fuel"].values:
         df = fuel_prices_df[fuel_prices_df["fuel"] == f]
         for period, month in periods_months:
             if not ((df.period == period) & (df.month == month)).any():
-                validation_results.append(
-                    (subscenarios.SCENARIO_ID,
-                     __name__,
-                     "PROJECT_FUEL_PRICES",
-                     "inputs_project_fuel_prices",
-                     "Missing fuel price",
-                     "Fuel '{}': Missing price for period '{}', month '{}')"
-                     .format(f, str(period), str(month))
-                     )
+                results.append(
+                    "Fuel '{}': Missing price for period '{}', month '{}'"
+                    .format(f, str(period), str(month))
                 )
 
-    # Write all input validation errors to database
-    write_validation_to_database(validation_results, conn)
+    return results
 
 
 def write_model_inputs(inputs_directory, subscenarios, subproblem, stage, conn):

diff --git a/tests/auxiliary/test_auxiliary.py b/tests/auxiliary/test_auxiliary.py
@@ -3,6 +3,8 @@
 
 from pyomo.environ import AbstractModel
 import unittest
+import pandas as pd
+import numpy as np
 
 import gridpath.auxiliary.auxiliary as auxiliary_module_to_test
 
@@ -85,6 +87,104 @@ def test_is_number(self):
         self.assertEqual(True, auxiliary_module_to_test.is_number(100.5))
         self.assertEqual(False, auxiliary_module_to_test.is_number("string"))
 
+    def test_check_dtypes(self):
+        """
+
+        :return:
+        """
+        test_cases = {
+            # Make sure correct inputs don't throw error
+            1: {"df": pd.DataFrame(
+                    columns=["project", "capacity"],
+                    data=[["gas_ct", 10], ["coal_plant", 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": ([], [])
+                },
+            # Test a string value in a column expected to be numeric
+            2: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[["gas_ct", 10], ["coal_plant", "string"]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": (
+                    ["Invalid data type for column 'capacity'; expected numeric"],
+                    ["capacity"]
+                )},
+            # Test numeric values in a column expected to be string
+            3: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[[1, 10], [1, 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": (
+                    ["Invalid data type for column 'project'; expected string"],
+                    ["project"]
+                )},
+            # If at least one value in the column is a string, pandas stores
+            # the column with the generic object dtype, so there is no error
+            4: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[["gas_ct", 10], [1, 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": ([], [])
+                },
+            # Columns with all None are ignored
+            5: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[[None, 10], [None, 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": ([], [])
+                },
+            # Columns with all NaN are ignored
+            6: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[[np.nan, 10], [np.nan, 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": ([], [])
+                },
+            # Columns with some None are not ignored
+            7: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[[10, 10], [None, 20]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": (
+                    ["Invalid data type for column 'project'; expected string"],
+                    ["project"]
+                )},
+            # Test multiple error columns
+            8: {"df": pd.DataFrame(
+                columns=["project", "capacity"],
+                data=[[10, "string"], [10, "string"]]),
+                "expected_dtypes": {
+                    "project": "string",
+                    "capacity": "numeric"},
+                "result": (
+                    ["Invalid data type for column 'project'; expected string",
+                     "Invalid data type for column 'capacity'; expected numeric"],
+                    ["project", "capacity"]
+                )}
+        }
+
+        for test_case in test_cases.keys():
+            expected_tuple = test_cases[test_case]["result"]
+            actual_tuple = auxiliary_module_to_test.check_dtypes(
+                df=test_cases[test_case]["df"],
+                expected_dtypes=test_cases[test_case]["expected_dtypes"]
+            )
+            self.assertTupleEqual(expected_tuple, actual_tuple)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/project/test_fuels.py b/tests/project/test_fuels.py
@@ -13,6 +13,8 @@
 
 from tests.common_functions import create_abstract_model, \
     add_components_and_load_data
+from gridpath.project.fuels import \
+    validate_fuel_projects, validate_fuel_prices
 
 TEST_DATA_DIRECTORY = \
     os.path.join(os.path.dirname(__file__), "..", "test_data")
@@ -133,6 +135,94 @@ def test_initialized_components(self):
         )
         self.assertDictEqual(expected_price, actual_price)
 
+    def test_fuel_validations(self):
+        test_cases = {
+            # Make sure correct inputs don't throw error
+            1: {"prj_df": pd.DataFrame(
+                    columns=["project", "fuel"],
+                    data=[["gas_ct", "gas"], ["coal_plant", "coal"]]),
+                "fuels_df": pd.DataFrame(
+                    columns=["fuel", "co2_intensity_tons_per_mmbtu"],
+                    data=[["gas", 0.4], ["coal", 0.8]]),
+                "fuel_prices_df": pd.DataFrame(
+                    columns=["fuel", "period", "month", "fuel_price_per_mmbtu"],
+                    data=[["gas", 2018, 1, 3], ["gas", 2018, 2, 4],
+                          ["coal", 2018, 1, 2], ["coal", 2018, 2, 2]]),
+                "periods_months": [(2018, 1), (2018, 2)],
+                "fuel_project_error": [],
+                "fuel_prices_error": []
+                },
+            # If a project's fuel in prj_df does not exist in the fuels_df,
+            # there should be an error. Similarly, if a fuel price is missing
+            # for a certain month/period, there should be an error.
+            2: {"prj_df": pd.DataFrame(
+                    columns=["project", "fuel"],
+                    data=[["gas_ct", "invalid_fuel"], ["coal_plant", "coal"]]),
+                "fuels_df": pd.DataFrame(
+                    columns=["fuel", "co2_intensity_tons_per_mmbtu"],
+                    data=[["gas", 0.4], ["coal", 0.8]]),
+                "fuel_prices_df": pd.DataFrame(
+                    columns=["fuel", "period", "month", "fuel_price_per_mmbtu"],
+                    data=[["gas", 2018, 1, 3],
+                          ["coal", 2018, 1, 2], ["coal", 2018, 2, 2]]),
+                "periods_months": [(2018, 1), (2018, 2)],
+                "fuel_project_error": [
+                    "Project(s) 'gas_ct': Specified fuel(s) 'invalid_fuel' do(es) not exist"],
+                "fuel_prices_error": [
+                    "Fuel 'gas': Missing price for period '2018', month '2'"]
+                },
+            # It's okay if there are more fuels and fuels prices specified than
+            # needed for the active projects
+            3: {"prj_df": pd.DataFrame(
+                    columns=["project", "fuel"],
+                    data=[["gas_ct", "gas"]]),
+                "fuels_df": pd.DataFrame(
+                    columns=["fuel", "co2_intensity_tons_per_mmbtu"],
+                    data=[["gas", 0.4], ["coal", 0.8]]),
+                "fuel_prices_df": pd.DataFrame(
+                    columns=["fuel", "period", "month", "fuel_price_per_mmbtu"],
+                    data=[["gas", 2018, 1, 3], ["gas", 2018, 2, 4],
+                          ["coal", 2018, 1, 2], ["coal", 2018, 2, 2]]),
+                "periods_months": [(2018, 1), (2018, 2)],
+                "fuel_project_error": [],
+                "fuel_prices_error": []
+                },
+            # Test for multiple errors in a column
+            4: {"prj_df": pd.DataFrame(
+                columns=["project", "fuel"],
+                data=[["gas_ct", "invalid_fuel1"], ["coal_plant", "invalid_fuel2"]]),
+                "fuels_df": pd.DataFrame(
+                    columns=["fuel", "co2_intensity_tons_per_mmbtu"],
+                    data=[["gas", 0.4], ["coal", 0.8]]),
+                "fuel_prices_df": pd.DataFrame(
+                    columns=["fuel", "period", "month", "fuel_price_per_mmbtu"],
+                    data=[["gas", 2018, 1, 3],
+                          ["coal", 2018, 1, 2]]),
+                "periods_months": [(2018, 1), (2018, 2)],
+                "fuel_project_error":
+                    ["Project(s) 'gas_ct, coal_plant': Specified fuel(s) 'invalid_fuel1, invalid_fuel2' do(es) not exist"],
+                "fuel_prices_error":
+                    ["Fuel 'gas': Missing price for period '2018', month '2'",
+                     "Fuel 'coal': Missing price for period '2018', month '2'"]
+                }
+        }
+
+        for test_case in test_cases.keys():
+            expected_list = test_cases[test_case]["fuel_project_error"]
+            actual_list = validate_fuel_projects(
+                prj_df=test_cases[test_case]["prj_df"],
+                fuels_df=test_cases[test_case]["fuels_df"]
+            )
+            self.assertListEqual(expected_list, actual_list)
+
+            expected_list = test_cases[test_case]["fuel_prices_error"]
+            actual_list = validate_fuel_prices(
+                fuels_df=test_cases[test_case]["fuels_df"],
+                fuel_prices_df=test_cases[test_case]["fuel_prices_df"],
+                periods_months=test_cases[test_case]["periods_months"]
+            )
+            self.assertListEqual(expected_list, actual_list)
+
 
 if __name__ == "__main__":
     unittest.main()