From 488f703b143156744ac00835731f8409ccdc2ddd Mon Sep 17 00:00:00 2001 From: Diane Napolitano Date: Fri, 13 Dec 2024 11:52:22 -0500 Subject: [PATCH] Moving the preprocessed data superclass to a new branch --- .../data/BasePreprocessedDataHandler.py | 60 ---------------- src/elexmodel/handlers/data/LiveData.py | 58 ++++++++++++--- .../handlers/data/PreprocessedData.py | 71 ++++++++++++++----- tests/handlers/test_preprocessed_data.py | 2 +- 4 files changed, 101 insertions(+), 90 deletions(-) delete mode 100644 src/elexmodel/handlers/data/BasePreprocessedDataHandler.py diff --git a/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py b/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py deleted file mode 100644 index f99f9273..00000000 --- a/src/elexmodel/handlers/data/BasePreprocessedDataHandler.py +++ /dev/null @@ -1,60 +0,0 @@ -import abc -from io import StringIO -from pathlib import Path - -import pandas as pd - -from elexmodel.handlers.data.Estimandizer import Estimandizer -from elexmodel.utils.file_utils import get_directory_path - - -class BasePreprocessedDataHandler(abc.ABC): - """ - Abstract base handler for preprocessed (input) model data - """ - - def __init__( - self, election_id, office_id, geographic_unit_type, estimands, s3_client=None, historical=False, data=None - ): - self.election_id = election_id - self.office_id = office_id - self.geographic_unit_type = geographic_unit_type - self.estimands = estimands - self.s3_client = s3_client - self.historical = historical - self.estimandizer = Estimandizer() - self.file_path = self.get_data_path() - - if data is not None: - self.data = self.load_data(data) - else: - self.data = self.get_data() - - def get_data_path(self): - directory_path = get_directory_path() - path = f"{directory_path}/data/{self.election_id}/{self.office_id}/data_{self.geographic_unit_type}.csv" - return path - - def get_data(self): - # If local data file is not available, read data from s3 - if not Path(self.file_path).is_file(): - path_info = { - "election_id": self.election_id, - "office": self.office_id, - "geographic_unit_type": self.geographic_unit_type, - } - file_path = self.s3_client.get_file_path("preprocessed", path_info) - - csv_data = self.s3_client.get(file_path) - # read data as a buffer - preprocessed_data = StringIO(csv_data) - else: - # read data as a filepath - preprocessed_data = self.file_path - - data = pd.read_csv(preprocessed_data, dtype={"geographic_unit_fips": str, "county_fips": str, "district": str}) - return self.load_data(data) - - @abc.abstractmethod - def load_data(self, data): - pass diff --git a/src/elexmodel/handlers/data/LiveData.py b/src/elexmodel/handlers/data/LiveData.py index 797f61be..56e6c352 100644 --- a/src/elexmodel/handlers/data/LiveData.py +++ b/src/elexmodel/handlers/data/LiveData.py @@ -1,12 +1,15 @@ import math +from io import StringIO +from pathlib import Path import numpy as np import pandas as pd -from elexmodel.handlers.data.BasePreprocessedDataHandler import BasePreprocessedDataHandler +from elexmodel.handlers.data.Estimandizer import Estimandizer +from elexmodel.utils.file_utils import get_directory_path -class MockLiveDataHandler(BasePreprocessedDataHandler): +class MockLiveDataHandler: """ Handles current data, which we would pull from Dynamo on an election night """ @@ -22,7 +25,14 @@ def __init__( s3_client=None, unexpected_units=0, ): + self.election_id = election + self.office_id = office_id + self.geographic_unit_type = geographic_unit_type + self.estimands = estimands + self.s3_client = s3_client + self.historical = historical self.unexpected_rows = unexpected_units + self.estimandizer = Estimandizer() self.shuffle_columns = [ "postal_code", @@ -31,17 +41,45 @@ def __init__( ] # columns we may want to sample by self.shuffle_dataframe = None + self.data = data + if data is not None: + # passed in as a df + data_for_estimands = self.load_data(data) + self.data = data_for_estimands + else: + self.data = self.get_data() + self.current_reporting_data = None - super().__init__( - election, - office_id, - geographic_unit_type, - estimands, - s3_client=s3_client, - historical=historical, - data=data, + def get_data(self): + file_path = self.get_live_data_file_path() + # If local data file is not available, read data from s3 + if not Path(file_path).is_file(): + path_info = { + "election_id": self.election_id, + "office": self.office_id, + "geographic_unit_type": self.geographic_unit_type, + } + # we're mimicking live data from a file of preprocessed data + # but for a real live election, we will pull live data from dynamo + file_path = self.s3_client.get_file_path("preprocessed", path_info) + csv_data = self.s3_client.get(file_path) + # read data as a buffer + live_data = StringIO(csv_data) + else: + # read data as a filepath + live_data = file_path + + data = pd.read_csv( + live_data, + dtype={"geographic_unit_fips": str, "geographic_unit_type": str, "county_fips": str, "district": str}, ) + data = self.load_data(data) + return data + + def get_live_data_file_path(self): + directory_path = get_directory_path() + return f"{directory_path}/data/{self.election_id}/{self.office_id}/data_{self.geographic_unit_type}.csv" def load_data(self, data): columns_to_return = ["postal_code", "geographic_unit_fips"] diff --git a/src/elexmodel/handlers/data/PreprocessedData.py b/src/elexmodel/handlers/data/PreprocessedData.py index 01170a3a..82ba7335 100644 --- a/src/elexmodel/handlers/data/PreprocessedData.py +++ b/src/elexmodel/handlers/data/PreprocessedData.py @@ -1,13 +1,16 @@ +from io import StringIO from pathlib import Path -from elexmodel.handlers.data.BasePreprocessedDataHandler import BasePreprocessedDataHandler +import pandas as pd + +from elexmodel.handlers.data.Estimandizer import Estimandizer from elexmodel.logger import getModelLogger -from elexmodel.utils.file_utils import create_directory +from elexmodel.utils.file_utils import create_directory, get_directory_path LOG = getModelLogger() -class PreprocessedDataHandler(BasePreprocessedDataHandler): +class PreprocessedDataHandler: """ Handler for preprocessed data for model """ @@ -15,7 +18,7 @@ class PreprocessedDataHandler(BasePreprocessedDataHandler): def __init__( self, election_id, - office_id, + office, geographic_unit_type, estimands, estimand_baselines, @@ -27,17 +30,47 @@ def __init__( """ Initialize preprocessed data. If not present, download from s3. """ + self.election_id = election_id + self.office = office + self.geographic_unit_type = geographic_unit_type + self.estimands = estimands + self.s3_client = s3_client self.estimand_baselines = estimand_baselines + self.historical = historical self.include_results_estimand = include_results_estimand - super().__init__( - election_id, - office_id, - geographic_unit_type, - estimands, - s3_client=s3_client, - historical=historical, - data=data, - ) + self.estimandizer = Estimandizer() + + self.local_file_path = self.get_preprocessed_data_path() + + if data is not None: + self.data = self.load_data(data) + else: + self.data = self.get_data() + + def get_data(self): + # If local data file is not available, read data from s3 + if not Path(self.local_file_path).is_file(): + path_info = { + "election_id": self.election_id, + "office": self.office, + "geographic_unit_type": self.geographic_unit_type, + } + file_path = self.s3_client.get_file_path("preprocessed", path_info) + + csv_data = self.s3_client.get(file_path) + # read data as a buffer + preprocessed_data = StringIO(csv_data) + else: + # read data as a filepath + preprocessed_data = self.local_file_path + + data = pd.read_csv(preprocessed_data, dtype={"geographic_unit_fips": str, "county_fips": str, "district": str}) + return self.load_data(data) + + def get_preprocessed_data_path(self): + directory_path = get_directory_path() + path = f"{directory_path}/data/{self.election_id}/{self.office}/data_{self.geographic_unit_type}.csv" + return path def select_rows_in_states(self, data, states_with_election): data = data.query( @@ -47,13 +80,13 @@ def select_rows_in_states(self, data, states_with_election): ) return data - def load_data(self, data): + def load_data(self, preprocessed_data): """ Load preprocessed csv data as df """ - LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office_id, self.geographic_unit_type) + LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type) data = self.estimandizer.add_estimand_baselines( - data, + preprocessed_data, self.estimand_baselines, self.historical, include_results_estimand=self.include_results_estimand, @@ -62,6 +95,6 @@ def load_data(self, data): return data def save_data(self, preprocessed_data): - if not Path(self.file_path).parent.exists(): - create_directory(str(Path(self.file_path).parent)) - preprocessed_data.to_csv(self.file_path, index=False) + if not Path(self.local_file_path).parent.exists(): + create_directory(str(Path(self.local_file_path).parent)) + preprocessed_data.to_csv(self.local_file_path, index=False) diff --git a/tests/handlers/test_preprocessed_data.py b/tests/handlers/test_preprocessed_data.py index fac5f7c8..35adac7b 100644 --- a/tests/handlers/test_preprocessed_data.py +++ b/tests/handlers/test_preprocessed_data.py @@ -10,7 +10,7 @@ def test_save(va_governor_county_data, test_path): local_file_path = f"{test_path}/test_dir/data_county.csv" if os.path.exists(local_file_path): os.remove(local_file_path) - data_handler.file_path = local_file_path + data_handler.local_file_path = local_file_path data_handler.save_data(va_governor_county_data) assert os.path.exists(local_file_path)