From 488f703b143156744ac00835731f8409ccdc2ddd Mon Sep 17 00:00:00 2001
From: Diane Napolitano <diane.napolitano@washpost.com>
Date: Fri, 13 Dec 2024 11:52:22 -0500
Subject: [PATCH] Moving the preprocessed data superclass to a new branch

---
 .../data/BasePreprocessedDataHandler.py       | 60 ----------------
 src/elexmodel/handlers/data/LiveData.py       | 58 ++++++++++++---
 .../handlers/data/PreprocessedData.py         | 71 ++++++++++++++-----
 tests/handlers/test_preprocessed_data.py      |  2 +-
 4 files changed, 101 insertions(+), 90 deletions(-)
 delete mode 100644 src/elexmodel/handlers/data/BasePreprocessedDataHandler.py

diff --git a/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py b/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py
deleted file mode 100644
index f99f9273..00000000
--- a/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import abc
-from io import StringIO
-from pathlib import Path
-
-import pandas as pd
-
-from elexmodel.handlers.data.Estimandizer import Estimandizer
-from elexmodel.utils.file_utils import get_directory_path
-
-
-class BasePreprocessedDataHandler(abc.ABC):
-    """
-    Abstract base handler for preprocessed (input) model data
-    """
-
-    def __init__(
-        self, election_id, office_id, geographic_unit_type, estimands, s3_client=None, historical=False, data=None
-    ):
-        self.election_id = election_id
-        self.office_id = office_id
-        self.geographic_unit_type = geographic_unit_type
-        self.estimands = estimands
-        self.s3_client = s3_client
-        self.historical = historical
-        self.estimandizer = Estimandizer()
-        self.file_path = self.get_data_path()
-
-        if data is not None:
-            self.data = self.load_data(data)
-        else:
-            self.data = self.get_data()
-
-    def get_data_path(self):
-        directory_path = get_directory_path()
-        path = f"{directory_path}/data/{self.election_id}/{self.office_id}/data_{self.geographic_unit_type}.csv"
-        return path
-
-    def get_data(self):
-        # If local data file is not available, read data from s3
-        if not Path(self.file_path).is_file():
-            path_info = {
-                "election_id": self.election_id,
-                "office": self.office_id,
-                "geographic_unit_type": self.geographic_unit_type,
-            }
-            file_path = self.s3_client.get_file_path("preprocessed", path_info)
-
-            csv_data = self.s3_client.get(file_path)
-            # read data as a buffer
-            preprocessed_data = StringIO(csv_data)
-        else:
-            # read data as a filepath
-            preprocessed_data = self.file_path
-
-        data = pd.read_csv(preprocessed_data, dtype={"geographic_unit_fips": str, "county_fips": str, "district": str})
-        return self.load_data(data)
-
-    @abc.abstractmethod
-    def load_data(self, data):
-        pass
diff --git a/src/elexmodel/handlers/data/LiveData.py b/src/elexmodel/handlers/data/LiveData.py
index 797f61be..56e6c352 100644
--- a/src/elexmodel/handlers/data/LiveData.py
+++ b/src/elexmodel/handlers/data/LiveData.py
@@ -1,12 +1,15 @@
 import math
+from io import StringIO
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
 
-from elexmodel.handlers.data.BasePreprocessedDataHandler import BasePreprocessedDataHandler
+from elexmodel.handlers.data.Estimandizer import Estimandizer
+from elexmodel.utils.file_utils import get_directory_path
 
 
-class MockLiveDataHandler(BasePreprocessedDataHandler):
+class MockLiveDataHandler:
     """
     Handles current data, which we would pull from Dynamo on an election night
     """
@@ -22,7 +25,14 @@ def __init__(
         s3_client=None,
         unexpected_units=0,
     ):
+        self.election_id = election
+        self.office_id = office_id
+        self.geographic_unit_type = geographic_unit_type
+        self.estimands = estimands
+        self.s3_client = s3_client
+        self.historical = historical
         self.unexpected_rows = unexpected_units
+        self.estimandizer = Estimandizer()
 
         self.shuffle_columns = [
             "postal_code",
@@ -31,17 +41,45 @@ def __init__(
         ]  # columns we may want to sample by
         self.shuffle_dataframe = None
 
+        self.data = data
+        if data is not None:
+            # passed in as a df
+            data_for_estimands = self.load_data(data)
+            self.data = data_for_estimands
+        else:
+            self.data = self.get_data()
+
         self.current_reporting_data = None
 
-        super().__init__(
-            election,
-            office_id,
-            geographic_unit_type,
-            estimands,
-            s3_client=s3_client,
-            historical=historical,
-            data=data,
+    def get_data(self):
+        file_path = self.get_live_data_file_path()
+        # If local data file is not available, read data from s3
+        if not Path(file_path).is_file():
+            path_info = {
+                "election_id": self.election_id,
+                "office": self.office_id,
+                "geographic_unit_type": self.geographic_unit_type,
+            }
+            # We mimic live data here by reading a file of preprocessed data;
+            # in a real live election, live data would be pulled from Dynamo instead.
+            file_path = self.s3_client.get_file_path("preprocessed", path_info)
+            csv_data = self.s3_client.get(file_path)
+            # read data as a buffer
+            live_data = StringIO(csv_data)
+        else:
+            # read data as a filepath
+            live_data = file_path
+
+        data = pd.read_csv(
+            live_data,
+            dtype={"geographic_unit_fips": str, "geographic_unit_type": str, "county_fips": str, "district": str},
         )
+        data = self.load_data(data)
+        return data
+
+    def get_live_data_file_path(self):
+        directory_path = get_directory_path()
+        return f"{directory_path}/data/{self.election_id}/{self.office_id}/data_{self.geographic_unit_type}.csv"
 
     def load_data(self, data):
         columns_to_return = ["postal_code", "geographic_unit_fips"]
diff --git a/src/elexmodel/handlers/data/PreprocessedData.py b/src/elexmodel/handlers/data/PreprocessedData.py
index 01170a3a..82ba7335 100644
--- a/src/elexmodel/handlers/data/PreprocessedData.py
+++ b/src/elexmodel/handlers/data/PreprocessedData.py
@@ -1,13 +1,16 @@
+from io import StringIO
 from pathlib import Path
 
-from elexmodel.handlers.data.BasePreprocessedDataHandler import BasePreprocessedDataHandler
+import pandas as pd
+
+from elexmodel.handlers.data.Estimandizer import Estimandizer
 from elexmodel.logger import getModelLogger
-from elexmodel.utils.file_utils import create_directory
+from elexmodel.utils.file_utils import create_directory, get_directory_path
 
 LOG = getModelLogger()
 
 
-class PreprocessedDataHandler(BasePreprocessedDataHandler):
+class PreprocessedDataHandler:
     """
     Handler for preprocessed data for model
     """
@@ -15,7 +18,7 @@ class PreprocessedDataHandler(BasePreprocessedDataHandler):
     def __init__(
         self,
         election_id,
-        office_id,
+        office,
         geographic_unit_type,
         estimands,
         estimand_baselines,
@@ -27,17 +30,47 @@ def __init__(
         """
         Initialize preprocessed data. If not present, download from s3.
         """
+        self.election_id = election_id
+        self.office = office
+        self.geographic_unit_type = geographic_unit_type
+        self.estimands = estimands
+        self.s3_client = s3_client
         self.estimand_baselines = estimand_baselines
+        self.historical = historical
         self.include_results_estimand = include_results_estimand
-        super().__init__(
-            election_id,
-            office_id,
-            geographic_unit_type,
-            estimands,
-            s3_client=s3_client,
-            historical=historical,
-            data=data,
-        )
+        self.estimandizer = Estimandizer()
+
+        self.local_file_path = self.get_preprocessed_data_path()
+
+        if data is not None:
+            self.data = self.load_data(data)
+        else:
+            self.data = self.get_data()
+
+    def get_data(self):
+        # If local data file is not available, read data from s3
+        if not Path(self.local_file_path).is_file():
+            path_info = {
+                "election_id": self.election_id,
+                "office": self.office,
+                "geographic_unit_type": self.geographic_unit_type,
+            }
+            file_path = self.s3_client.get_file_path("preprocessed", path_info)
+
+            csv_data = self.s3_client.get(file_path)
+            # read data as a buffer
+            preprocessed_data = StringIO(csv_data)
+        else:
+            # read data as a filepath
+            preprocessed_data = self.local_file_path
+
+        data = pd.read_csv(preprocessed_data, dtype={"geographic_unit_fips": str, "county_fips": str, "district": str})
+        return self.load_data(data)
+
+    def get_preprocessed_data_path(self):
+        directory_path = get_directory_path()
+        path = f"{directory_path}/data/{self.election_id}/{self.office}/data_{self.geographic_unit_type}.csv"
+        return path
 
     def select_rows_in_states(self, data, states_with_election):
         data = data.query(
@@ -47,13 +80,13 @@ def select_rows_in_states(self, data, states_with_election):
         )
         return data
 
-    def load_data(self, data):
+    def load_data(self, preprocessed_data):
         """
         Load preprocessed csv data as df
         """
-        LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office_id, self.geographic_unit_type)
+        LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type)
         data = self.estimandizer.add_estimand_baselines(
-            data,
+            preprocessed_data,
             self.estimand_baselines,
             self.historical,
             include_results_estimand=self.include_results_estimand,
@@ -62,6 +95,6 @@ def load_data(self, data):
         return data
 
     def save_data(self, preprocessed_data):
-        if not Path(self.file_path).parent.exists():
-            create_directory(str(Path(self.file_path).parent))
-        preprocessed_data.to_csv(self.file_path, index=False)
+        if not Path(self.local_file_path).parent.exists():
+            create_directory(str(Path(self.local_file_path).parent))
+        preprocessed_data.to_csv(self.local_file_path, index=False)
diff --git a/tests/handlers/test_preprocessed_data.py b/tests/handlers/test_preprocessed_data.py
index fac5f7c8..35adac7b 100644
--- a/tests/handlers/test_preprocessed_data.py
+++ b/tests/handlers/test_preprocessed_data.py
@@ -10,7 +10,7 @@ def test_save(va_governor_county_data, test_path):
     local_file_path = f"{test_path}/test_dir/data_county.csv"
     if os.path.exists(local_file_path):
         os.remove(local_file_path)
-    data_handler.file_path = local_file_path
+    data_handler.local_file_path = local_file_path
     data_handler.save_data(va_governor_county_data)
 
     assert os.path.exists(local_file_path)