diff --git a/README.md b/README.md index 1e1d0d5a..2708e5b8 100644 --- a/README.md +++ b/README.md @@ -75,26 +75,6 @@ Parameters for the CLI tool: Note: When running the model with multiple fixed effects, make sure they are not linearly dependent. For example, `county_fips` and `county_classification` are linearly dependent when run together. That's because every county is in one county class, so all the fixed effect columns of the counties in the county class sum up to the fixed effect column of that county class. -#### Custom Estimands - -It's possible to create a custom estimand based on other data elements. Here's how to create a new estimand called "my_estimand": - -1. In `src/elexmodel/handlers/data/Estimandizer.py`, create a function with the signature `def my_estimand(data_df)`. -2. In `my_estimand()`, use the columns in `data_df` to create a new column, either `baseline_my_estimand` or `results_my_estimand` as necessary. See the `party_vote_share_dem` function for an example. -3. Specify `my_estimand` as one of your estimands. For example, via the command line: - -``` -elexmodel 2017-11-07_VA_G --estimands my_estimand --office_id=G --geographic_unit_type=county --percent_reporting 50 -``` - -Your output should have columns including `baseline_my_estimand`, `results_my_estimand`, and related columns for the prediction intervals, if using them. - -Here's an example showing multiple estimands, including `my_estimand`: - -``` -elexmodel 2017-11-07_VA_G --estimands=turnout --estimands my_estimand --estimands party_vote_share_dem --office_id=G --geographic_unit_type=county --percent_reporting 50 -``` - #### Python ##### Model Parameters diff --git a/setup.cfg b/setup.cfg index ac26f636..704471d8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,7 +7,7 @@ max-line-length = 120 [pylint] max-line-length = 120 good-names= on, x, df, NonparametricElectionModel, GaussianElectionModel, - BaseElectionModel, qr, X, y, f, LiveData, n, Featurizer, Estimandizer, fe, PreprocessedData, CombinedData, + BaseElectionModel, qr, X, y, f, LiveData, n, Featurizer, fe, PreprocessedData, CombinedData, ModelResults, GaussianModel, MODEL_THRESHOLD, LOG, w, df_X, df_y, v, n, g, a, b disable=missing-function-docstring, missing-module-docstring, missing-class-docstring, #missing too-many-arguments, too-many-locals, too-many-branches, too-many-instance-attributes, too-many-statements, #structure: too-many diff --git a/src/elexmodel/cli.py b/src/elexmodel/cli.py index c7128ebe..88b20092 100644 --- a/src/elexmodel/cli.py +++ b/src/elexmodel/cli.py @@ -20,8 +20,8 @@ class PythonLiteralOption(click.Option): def type_cast_value(self, ctx, value): try: return ast.literal_eval(value) - except ValueError as e: - raise click.BadParameter(value) from e + except ValueError: + raise click.BadParameter(value) @click.command() diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 6194b7f0..f6e5fd65 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -56,27 +56,21 @@ def _check_input_parameters( offices = config_handler.get_offices() if office not in offices: raise ValueError(f"Office '{office}' is not valid. Please check config.") - - estimands_in_config = config_handler.get_estimands(office) - extra_estimands = set(estimands).difference(set(estimands_in_config)) - if len(extra_estimands) > 0: - # this is ok; later they'll all be created by the Estimandizer - LOG.info("Found additional estimands that were not specified in the config file: %s", extra_estimands) - + valid_estimands = config_handler.get_estimands(office) + for estimand in estimands: + if estimand not in valid_estimands: + raise ValueError(f"Estimand: '{estimand}' is not valid. Please check config") geographic_unit_types = config_handler.get_geographic_unit_types(office) if geographic_unit_type not in geographic_unit_types: raise ValueError(f"Geographic unit type: '{geographic_unit_type}' is not valid. Please check config") - model_features = config_handler.get_features(office) - invalid_features = list(set(features).difference(set(model_features))) + invalid_features = [feature for feature in features if feature not in model_features] if len(invalid_features) > 0: raise ValueError(f"Feature(s): {invalid_features} not valid. Please check config") - model_aggregates = config_handler.get_aggregates(office) invalid_aggregates = [aggregate for aggregate in aggregates if aggregate not in model_aggregates] if len(invalid_aggregates) > 0: raise ValueError(f"Aggregate(s): {invalid_aggregates} not valid. Please check config") - model_fixed_effects = config_handler.get_fixed_effects(office) if isinstance(fixed_effects, dict): invalid_fixed_effects = [ @@ -88,16 +82,14 @@ def _check_input_parameters( ] if len(invalid_fixed_effects) > 0: raise ValueError(f"Fixed effect(s): {invalid_fixed_effects} not valid. Please check config") - if pi_method not in {"gaussian", "nonparametric"}: raise ValueError( f"Prediction interval method: {pi_method} is not valid. \ pi_method has to be either `gaussian` or `nonparametric`." ) - if not isinstance(model_parameters, dict): raise ValueError("model_paramters is not valid. Has to be a dict.") - if model_parameters != {}: + elif model_parameters != {}: if "lambda_" in model_parameters and ( not isinstance(model_parameters["lambda_"], (float, int)) or model_parameters["lambda_"] < 0 ): @@ -110,10 +102,8 @@ def _check_input_parameters( elif pi_method == "nonparametric": if "robust" in model_parameters and not isinstance(model_parameters["robust"], bool): raise ValueError("robust is not valid. Has to be a boolean.") - if handle_unreporting not in {"drop", "zero"}: raise ValueError("handle_unreporting must be either `drop` or `zero`") - return True def get_aggregate_list(self, office, aggregate): diff --git a/src/elexmodel/handlers/data/CombinedData.py b/src/elexmodel/handlers/data/CombinedData.py index 951dff97..41bce8f3 100644 --- a/src/elexmodel/handlers/data/CombinedData.py +++ b/src/elexmodel/handlers/data/CombinedData.py @@ -1,5 +1,4 @@ from elexmodel.handlers import s3 -from elexmodel.handlers.data.Estimandizer import Estimandizer from elexmodel.utils.file_utils import S3_FILE_PATH, TARGET_BUCKET, convert_df_to_csv @@ -17,12 +16,6 @@ def __init__( handle_unreporting="drop", ): self.estimands = estimands - - estimandizer = Estimandizer() - (current_data, _) = estimandizer.check_and_create_estimands( - current_data.copy(), self.estimands, False, current_data=True - ) - # if we're running this for a past election, drop results columns from preprocessed data # so we use results_{estimand} numbers from current_data preprocessed_results_columns = list(filter(lambda col: col.startswith("results_"), preprocessed_data.columns)) @@ -34,7 +27,7 @@ def __init__( # if unreporting is 'drop' then drop units that are not reporting (ie. units where results are na) # this is necessary if units will not be returning results in this election, # but we didn't know that (ie. townships) - result_cols = [f"results_{estimand}" for estimand in self.estimands] + result_cols = [f"results_{estimand}" for estimand in estimands] if handle_unreporting == "drop": # Drop the whole row if an estimand is not reporting data = data.dropna(axis=0, how="any", subset=result_cols) diff --git a/src/elexmodel/handlers/data/Estimandizer.py b/src/elexmodel/handlers/data/Estimandizer.py deleted file mode 100644 index 30e6a00f..00000000 --- a/src/elexmodel/handlers/data/Estimandizer.py +++ /dev/null @@ -1,88 +0,0 @@ -from numpy import nan - - -class EstimandException(Exception): - pass - - -RESULTS_PREFIX = "results_" -BASELINE_PREFIX = "baseline_" - - -class Estimandizer: - """ - Estimandizer. Generate estimands explicitly. - """ - - def check_and_create_estimands(self, data_df, estimands, historical, current_data=False): - columns_to_return = [] - - for estimand in estimands: - results_col = f"{RESULTS_PREFIX}{estimand}" - baseline_col = f"{BASELINE_PREFIX}{estimand}" - target_col = results_col if current_data else baseline_col - - if target_col not in data_df.columns: - if estimand in data_df.columns: - data_df[target_col] = data_df[estimand].copy() - else: - # will raise a KeyError if a function with the same name as `estimand` doesn't exist - data_df = globals()[estimand](data_df) - if target_col == baseline_col: - data_df[results_col] = data_df[baseline_col].copy() - - if historical: - data_df[results_col] = nan - else: - if results_col not in data_df.columns: - raise EstimandException("This is missing results data for estimand: ", estimand) - - columns_to_return.append(results_col) - - results_column_names = [x for x in data_df.columns if x.startswith(RESULTS_PREFIX)] - # If this is not a historical run, then this is a live election - # so we are expecting that there will be actual results data - if not historical and len(results_column_names) == 0: - raise EstimandException("This is not a test election, it is missing results data") - - return (data_df, columns_to_return) - - def add_estimand_baselines(self, data_df, estimand_baselines, historical): - # if we are in a historical election we are only reading preprocessed data to get - # the historical election results of the currently reporting units. - # so we don't care about the total voters or the baseline election. - - for estimand, pointer in estimand_baselines.items(): - if pointer is None: - # should only happen when we're going to create a new estimand - pointer = estimand - - baseline_col = f"{BASELINE_PREFIX}{pointer}" - - if baseline_col not in data_df.columns: - # will raise a KeyError if a function with the same name as `pointer` doesn't exist - data_df = globals()[pointer](data_df) - results_col = f"{RESULTS_PREFIX}{estimand}" - data_df[results_col] = data_df[baseline_col].copy() - - if not historical: - # Adding one to prevent zero divison - data_df[f"last_election_results_{estimand}"] = data_df[baseline_col].copy() + 1 - - return data_df - - -# custom estimands - - -def party_vote_share_dem(data_df): - # should only happen when we're replaying an election - if f"{BASELINE_PREFIX}dem" not in data_df.columns and f"{BASELINE_PREFIX}turnout" not in data_df.columns: - data_df[f"{RESULTS_PREFIX}party_vote_share_dem"] = ( - data_df[f"{RESULTS_PREFIX}dem"] / data_df[f"{RESULTS_PREFIX}turnout"] - ) - else: - data_df[f"{BASELINE_PREFIX}party_vote_share_dem"] = ( - data_df[f"{BASELINE_PREFIX}dem"] / data_df[f"{BASELINE_PREFIX}turnout"] - ) - return data_df diff --git a/src/elexmodel/handlers/data/LiveData.py b/src/elexmodel/handlers/data/LiveData.py index c9c598f3..f52a9a9e 100644 --- a/src/elexmodel/handlers/data/LiveData.py +++ b/src/elexmodel/handlers/data/LiveData.py @@ -5,10 +5,13 @@ import numpy as np import pandas as pd -from elexmodel.handlers.data.Estimandizer import Estimandizer from elexmodel.utils.file_utils import get_directory_path +class MockLiveDataHandlerException(Exception): + pass + + class MockLiveDataHandler: """ Handles current data, which we would pull from Dynamo on an election night @@ -32,7 +35,6 @@ def __init__( self.s3_client = s3_client self.historical = historical self.unexpected_rows = unexpected_units - self.estimandizer = Estimandizer() self.shuffle_columns = [ "postal_code", @@ -44,7 +46,7 @@ def __init__( self.data = data if data is not None: # passed in as a df - data_for_estimands = self.load_data(data) + data_for_estimands = self.load_data(data, estimands, historical) self.data = data_for_estimands else: self.data = self.get_data() @@ -74,19 +76,26 @@ def get_data(self): live_data, dtype={"geographic_unit_fips": str, "geographic_unit_type": str, "county_fips": str, "district": str}, ) - data = self.load_data(data) + data = self.load_data(data, self.estimands, self.historical) return data def get_live_data_file_path(self): directory_path = get_directory_path() return f"{directory_path}/data/{self.election_id}/{self.office_id}/data_{self.geographic_unit_type}.csv" - def load_data(self, data): + def load_data(self, data, estimands, historical): columns_to_return = ["postal_code", "geographic_unit_fips"] - - (data, more_columns) = self.estimandizer.check_and_create_estimands(data, self.estimands, self.historical) - columns_to_return += more_columns - + for estimand in estimands: + if historical: + data[f"results_{estimand}"] = np.nan + results_column_names = [x for x in data.columns if x.startswith("results")] + # If this is not a historical run, then this is a live election + # so we are expecting that there will be actual results data + if not self.historical and len(results_column_names) == 0: + raise MockLiveDataHandlerException("This is not a test election, it is missing results data") + if f"results_{estimand}" not in results_column_names: + raise MockLiveDataHandlerException("This is missing results data for estimand: ", estimand) + columns_to_return.append(f"results_{estimand}") self.shuffle_dataframe = data[self.shuffle_columns].copy() return data[columns_to_return].copy() diff --git a/src/elexmodel/handlers/data/PreprocessedData.py b/src/elexmodel/handlers/data/PreprocessedData.py index a2ed3902..34f5c174 100644 --- a/src/elexmodel/handlers/data/PreprocessedData.py +++ b/src/elexmodel/handlers/data/PreprocessedData.py @@ -4,7 +4,6 @@ import pandas as pd -from elexmodel.handlers.data.Estimandizer import Estimandizer from elexmodel.utils.file_utils import create_directory, get_directory_path LOG = logging.getLogger(__name__) @@ -36,12 +35,11 @@ def __init__( self.s3_client = s3_client self.estimand_baselines = estimand_baselines self.historical = historical - self.estimandizer = Estimandizer() self.local_file_path = self.get_preprocessed_data_path() if data is not None: - self.data = self.load_data(data) + self.data = self.load_data(data, estimand_baselines) else: self.data = self.get_data() @@ -63,7 +61,7 @@ def get_data(self): preprocessed_data = self.local_file_path data = pd.read_csv(preprocessed_data, dtype={"geographic_unit_fips": str, "county_fips": str, "district": str}) - return self.load_data(data) + return self.load_data(data, self.estimand_baselines) def get_preprocessed_data_path(self): directory_path = get_directory_path() @@ -78,13 +76,24 @@ def select_rows_in_states(self, data, states_with_election): ) return data - def load_data(self, preprocessed_data): + def load_data(self, preprocessed_data, estimand_baselines): """ Load preprocessed csv data as df """ LOG.info("Loading preprocessed data: %s, %s, %s", self.election_id, self.office, self.geographic_unit_type) - return self.estimandizer.add_estimand_baselines(preprocessed_data, self.estimand_baselines, self.historical) + if self.historical: + # if we are in a historical election we are only reading preprocessed data to get + # the historical election results of the currently reporting units. + # so we don't care about the total voters or the baseline election. + return preprocessed_data + + for estimand, pointer in estimand_baselines.items(): + baseline_name = f"baseline_{pointer}" + # Adding one to prevent zero divison + preprocessed_data[f"last_election_results_{estimand}"] = preprocessed_data[baseline_name].copy() + 1 + + return preprocessed_data def save_data(self, preprocessed_data): if not Path(self.local_file_path).parent.exists(): diff --git a/src/elexmodel/logging.py b/src/elexmodel/logging.py index ef2fd49c..37cebd24 100644 --- a/src/elexmodel/logging.py +++ b/src/elexmodel/logging.py @@ -17,7 +17,7 @@ "elexmodel": { "handlers": ["default"], "level": "INFO", - "propagate": True, + "propagate": False, } }, } diff --git a/tests/conftest.py b/tests/conftest.py index 1760a21b..d6c0c48f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,20 +26,15 @@ def setup_logging(): def get_fixture(): def _get_fixture(filename, load=True, pandas=False): filepath = os.path.join(FIXTURE_DIR, filename) - with open(filepath, encoding="utf-8") as fileobj: - if load: - return json.load(fileobj) - if pandas: - return pd.read_csv( - filepath, - dtype={ - "geographic_unit_fips": str, - "geographic_unit_type": str, - "county_fips": str, - "district": str, - }, - ) - return fileobj + fileobj = open(filepath, encoding="utf-8") + if load: + return json.load(fileobj) + if pandas: + return pd.read_csv( + filepath, + dtype={"geographic_unit_fips": str, "geographic_unit_type": str, "county_fips": str, "district": str}, + ) + return fileobj return _get_fixture @@ -84,7 +79,7 @@ def va_governor_precinct_data(get_fixture): return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def va_governor_county_data(get_fixture): path = os.path.join("data", "2017-11-07_VA_G", "G", "data_county.csv") return get_fixture(path, load=False, pandas=True) @@ -102,12 +97,6 @@ def va_assembly_precinct_data(get_fixture): return get_fixture(path, load=False, pandas=True) -@pytest.fixture(scope="session") -def az_assembly_precinct_data(get_fixture): - path = os.path.join("data", "2020-08-04_AZ_R", "S", "data_precinct.csv") - return get_fixture(path, load=False, pandas=True) - - @pytest.fixture(scope="session") def test_path(): return _TEST_FOLDER diff --git a/tests/handlers/test_config.py b/tests/handlers/test_config.py index 2567b535..4d957786 100644 --- a/tests/handlers/test_config.py +++ b/tests/handlers/test_config.py @@ -33,8 +33,7 @@ def test_get_baseline_pointer_general(va_config): office = "G" baseline_pointer = config_handler.get_baseline_pointer(office) - expected = {"dem": "dem", "gop": "gop", "turnout": "turnout"} - assert expected == baseline_pointer + assert {"dem": "dem", "gop": "gop", "turnout": "turnout"} == baseline_pointer def test_get_baseline_pointer_primary(tx_primary_governor_config): @@ -77,8 +76,7 @@ def test_get_estimands_general(va_config): office = "G" estimands = config_handler.get_estimands(office) - expected = ["dem", "gop", "turnout"] - assert expected == estimands + assert ["dem", "gop", "turnout"] == estimands def test_get_estimands_primary(tx_primary_governor_config): diff --git a/tests/handlers/test_estimandizer.py b/tests/handlers/test_estimandizer.py deleted file mode 100644 index 84312e46..00000000 --- a/tests/handlers/test_estimandizer.py +++ /dev/null @@ -1,50 +0,0 @@ -from elexmodel.handlers.data.Estimandizer import Estimandizer - - -def test_check_and_create_estimands_not_historical(va_governor_county_data): - """ - Tests the check_and_create_estimands() method. - """ - - va_data_copy = va_governor_county_data.copy() - estimands = ["party_vote_share_dem"] - - estimandizer = Estimandizer() - (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, False) - - assert "baseline_party_vote_share_dem" in output_df.columns - assert "results_party_vote_share_dem" in output_df.columns - assert result_columns == ["results_party_vote_share_dem"] - - -def test_check_and_create_estimands_historical(va_governor_county_data): - """ - Tests the check_and_create_estimands() method with historical elections. - """ - va_data_copy = va_governor_county_data.copy() - estimands = ["party_vote_share_dem"] - - estimandizer = Estimandizer() - (output_df, result_columns) = estimandizer.check_and_create_estimands(va_data_copy, estimands, True) - - assert "baseline_party_vote_share_dem" in output_df.columns - assert "results_party_vote_share_dem" in output_df.columns - assert result_columns == ["results_party_vote_share_dem"] - - -def test_add_estimand_baselines_not_historical(va_governor_county_data): - estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"} - estimandizer = Estimandizer() - output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, False) - assert "baseline_party_vote_share_dem" in output_df.columns - assert "results_party_vote_share_dem" in output_df.columns - assert "last_election_results_party_vote_share_dem" in output_df.columns - - -def test_add_estimand_baselines_historical(va_governor_county_data): - estimand_baselines = {"turnout": "turnout", "party_vote_share_dem": "party_vote_share_dem"} - estimandizer = Estimandizer() - output_df = estimandizer.add_estimand_baselines(va_governor_county_data.copy(), estimand_baselines, True) - assert "baseline_party_vote_share_dem" in output_df.columns - assert "results_party_vote_share_dem" in output_df.columns - assert "last_election_results_party_vote_share_dem" not in output_df.columns diff --git a/tests/handlers/test_model_results.py b/tests/handlers/test_model_results.py index 9038be01..14cdd237 100644 --- a/tests/handlers/test_model_results.py +++ b/tests/handlers/test_model_results.py @@ -112,4 +112,4 @@ def test_no_unit_data(): handler.add_agg_predictions("e1", "postal_code", agg1_e1, intervals1_e1) handler.process_final_results() - assert "unit_data" not in handler.final_results + assert "unit_data" not in handler.final_results.keys() diff --git a/tests/test_client.py b/tests/test_client.py index 837d14b9..4a032802 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,5 +1,3 @@ -import logging - import numpy as np import pandas as pd import pytest @@ -7,11 +5,8 @@ from elexmodel.client import ModelNotEnoughSubunitsException from elexmodel.handlers.config import ConfigHandler from elexmodel.handlers.data.LiveData import MockLiveDataHandler -from elexmodel.logging import initialize_logging from elexmodel.utils import math_utils -initialize_logging() - # A set of valid model parameters office = "G" estimands = ["turnout", "dem"] @@ -85,15 +80,15 @@ def test_check_input_parameters_pi_method(model_client, va_config): ) -def test_check_input_parameters_estimand(caplog, model_client, va_config): +def test_check_input_parameters_estimand(model_client, va_config): election_id = "2017-11-07_VA_G" config_handler = ConfigHandler(election_id, config=va_config) - with caplog.at_level(logging.INFO): + with pytest.raises(ValueError): model_client._check_input_parameters( config_handler, office, - ["foo"], + ["estimand"], geographic_unit_type, features, aggregates, @@ -103,8 +98,6 @@ def test_check_input_parameters_estimand(caplog, model_client, va_config): handle_unreporting, ) - assert "Found additional estimands " in caplog.text - def test_check_input_parameters_geographic_unit_type(model_client, va_config): election_id = "2017-11-07_VA_G" @@ -766,38 +759,3 @@ def test_conformalization_data(model_client, va_governor_county_data, va_config) assert isinstance(conform_unit[0.9]["turnout"][1], pd.DataFrame) assert conform_agg[0.9]["turnout"][0] is None assert isinstance(conform_agg[0.9]["turnout"][1], pd.DataFrame) - - -def test_estimandizer_input(model_client, va_governor_county_data, va_config): - election_id = "2017-11-07_VA_G" - office_id = "G" - geographic_unit_type = "county" - estimands = ["turnout", "party_vote_share_dem"] - prediction_intervals = [0.9] - percent_reporting_threshold = 100 - - data_handler = MockLiveDataHandler( - election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data - ) - - data_handler.shuffle() - data = data_handler.get_percent_fully_reported(100) - - preprocessed_data = va_governor_county_data.copy() - preprocessed_data["last_election_results_turnout"] = preprocessed_data["baseline_turnout"].copy() + 1 - - try: - model_client.get_estimates( - data, - election_id, - office_id, - estimands, - prediction_intervals, - percent_reporting_threshold, - geographic_unit_type, - raw_config=va_config, - preprocessed_data=preprocessed_data, - save_output=[], - ) - except KeyError: - pytest.raises("Error with client input for estimandizer")