Code migration Texas #102

Open
wants to merge 6 commits into base: main
Changes from 2 commits
7 changes: 4 additions & 3 deletions scripts/transform_pipeline.py
@@ -32,9 +32,9 @@
input_directory.mkdir(parents=True, exist_ok=True)
output_directory.mkdir(parents=True, exist_ok=True)

individuals_output_path = output_directory / "individuals_table.csv"
organizations_output_path = output_directory / "organizations_table.csv"
transactions_output_path = output_directory / "transactions_table.csv"
individuals_output_path = output_directory / "individuals_table_TX.csv"
organizations_output_path = output_directory / "organizations_table_TX.csv"
transactions_output_path = output_directory / "transactions_table_TX.csv"
(
complete_individuals_table,
complete_organizations_table,
Expand All @@ -43,3 +43,4 @@
complete_individuals_table.to_csv(individuals_output_path)
complete_organizations_table.to_csv(organizations_output_path)
complete_transactions_table.to_csv(transactions_output_path)
print("Pipeline finished and saved data to CSV.")
38 changes: 38 additions & 0 deletions src/utils/scrape/texas.py
@@ -0,0 +1,38 @@
import pandas as pd
import os

# Directory containing the raw Texas Ethics Commission (TEC) campaign finance CSV exports
TEC_CSV_DIR = '/Users/yuexu/Downloads/TEC_CF_CSV'

contrib_files = [file for file in os.listdir(TEC_CSV_DIR) if file.startswith('contribs_') and file.endswith('.csv')]
expend_files = [file for file in os.listdir(TEC_CSV_DIR) if file.startswith('expend_') and file.endswith('.csv')]

def mergeFiles(files, mergedFileName):
merged_df = pd.DataFrame()

for file in files:
print(f"Processing File {file}...")
try:
df = pd.read_csv(os.path.join(TEC_CSV_DIR, file), low_memory=False)  # Read CSV file into DataFrame
# if mergedFileName =="merged_trimmed_contribs":
# df = df[['reportInfoIdent',
# 'receivedDt', 'filerIdent', 'filerTypeCd', 'filerName',
# 'contributionInfoId', 'contributionDt', 'contributionAmount',
# 'contributionDescr', 'contributorNameOrganization',
# 'contributorNameLast', 'contributorNameSuffixCd',
# 'contributorNameFirst', 'contributorNamePrefixCd',
# 'contributorNameShort', 'contributorEmployer',
# 'contributorOccupation', 'contributorJobTitle']]
df = df.sample(50)  # Keep a random 50-row sample from each file
merged_df = pd.concat([merged_df, df], ignore_index=True) # Concatenate with merged DataFrame
except FileNotFoundError:
print(f"File {file} not found. Skipping...")
continue

merged_df.to_csv(f'/Users/yuexu/Desktop/practicum project/climate-cabinet-campaign-finance-tracker/data/raw/TX/sample/{mergedFileName}.csv', index=False)





mergeFiles(contrib_files, "contribs")
# mergeFiles(expend_files, "merged_expend")


184 changes: 184 additions & 0 deletions src/utils/transform/Form.py
@@ -0,0 +1,184 @@
import abc

import pandas as pd

from utils.transform import constants as const


class Form(abc.ABC):
def __init__(self, required_columns: list[str], column_mapper: dict):
self.required_columns = required_columns
self.column_mapper = column_mapper
self.table = None

@abc.abstractmethod
def read_table(self, paths: list[str]) -> pd.DataFrame:
Contributor Author: Google style docstring, describe 'paths'. (A sketch follows this method.)

"""Read table(s) into a DataFrame"""
pass
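
Responding to the comment above, a Google-style docstring might read roughly like this (a sketch, not final wording):

@abc.abstractmethod
def read_table(self, paths: list[str]) -> pd.DataFrame:
    """Read one or more raw source files into a single DataFrame.

    Args:
        paths: Paths to the raw state export files (e.g. CSVs) that
            together make up this form's table.

    Returns:
        pd.DataFrame: The concatenated raw table, also stored on self.table.
    """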

def map_columns(self) -> None:
"""Map and filter columns of the DataFrame"""
Contributor Author: Add a bit more explanation to this. (A sketch follows this method.)

self.table.rename(columns=self.column_mapper, inplace=True)
self.table = self.table[self.required_columns]
return
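
On the "add a bit more explanation" comment above, the docstring could spell out what the method actually does; one possible sketch:

def map_columns(self) -> None:
    """Standardize the table's columns.

    Renames raw source columns using self.column_mapper, then keeps only
    self.required_columns so every Form subclass exposes the same
    standardized schema downstream.
    """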

def get_table(self) -> pd.DataFrame:
return self.table


class ContributionForm(Form):
def __init__(self, column_mapper=None):
required_columns = ["RECIPIENT_ID", "DONOR", "AMOUNT", "YEAR", "PURPOSE"]

super().__init__(required_columns, column_mapper)

def read_table(self, paths: list[str]) -> pd.DataFrame:
# Read and concatenate multiple tables (e.g., expense_01.csv, expense_02.csv)
tables = [pd.read_csv(path) for path in paths]
self.table = pd.concat(tables)
print("Reading contribution table...")
return self.table

def map_columns(self) -> None:
"""Refine the mapping and selection of columns specific to contributions."""
# Call the parent implementation of map_columns which automatically handles the mapping and filtering based on 'column_mapper' and 'required_columns'
super().map_columns()
self.table["RECIPIENT_ID"] = self.table["RECIPIENT_ID"].astype("str")
return


class ExpenseForm(Form):
def __init__(self, column_mapper=None):
Contributor Author: This would need a different 'read_table' for some states. To deal with this, you can make state-specific subclasses; this class should be considered the most common/standard Expense form. Make sure you add a docstring to the class that describes that. (A sketch follows this class.)

required_columns = [
"DONOR_ID",
"RECIPIENT",
Contributor Author: RECIPIENT_ID

"PURPOSE",
"AMOUNT",
"YEAR",
]

super().__init__(required_columns, column_mapper)

def read_table(self, paths: list[str]) -> pd.DataFrame:
# Read and concatenate multiple tables (e.g., expense_01.csv, expense_02.csv)
tables = [pd.read_csv(path) for path in paths]
self.table = pd.concat(tables)
return self.table

def map_columns(self) -> None:
"""Refine the mapping and selection of columns specific to expenses."""
# Call the parent implementation of map_columns which automatically handles the mapping and filtering based on 'column_mapper' and 'required_columns'
super().map_columns()
self.table["DONOR_ID"] = self.table["DONOR_ID"].astype("str")
return
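
As a rough illustration of the state-specific subclassing suggested in the comment above (the state and its tab-delimited format are made up for the example), an override could look like:

class SomeStateExpenseForm(ExpenseForm):
    """Hypothetical state whose expense exports are tab-delimited rather than CSV."""

    def read_table(self, paths: list[str]) -> pd.DataFrame:
        # Only the reading step changes; column mapping stays in ExpenseForm
        tables = [pd.read_csv(path, sep="\t") for path in paths]
        self.table = pd.concat(tables, ignore_index=True)
        return self.table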


class FilerForm(Form):
def __init__(self, column_mapper=None):
required_columns = [
"RECIPIENT_ID",
"RECIPIENT_TYPE",
"RECIPIENT",
"RECIPIENT_OFFICE",
"RECIPIENT_PARTY",
]

super().__init__(required_columns, column_mapper)

def read_table(self, paths: list[str]) -> pd.DataFrame:
# Read and concatenate multiple tables (e.g., filer_01.csv, filer_02.csv)
try:
tables = [pd.read_csv(path) for path in paths]
self.table = pd.concat(tables)
except Exception as e:
print(f"Error reading table: {e}")
return self.table

def map_columns(self) -> None:
"""Refine the mapping and selection of columns specific to filers."""
super().map_columns()
self.table["RECIPIENT_ID"] = self.table["RECIPIENT_ID"].astype("str")
self.table.drop_duplicates(subset=["RECIPIENT_ID"], inplace=True)
return


class TexasContributionForm(ContributionForm):
def __init__(self):
column_mapper = {"filerIdent": "RECIPIENT_ID", "contributionAmount": "AMOUNT"}

super().__init__(column_mapper)

def type_classifier(self, PersentTypeCd: str) -> str:
return "Individual" if PersentTypeCd.lower() == "individual" else "Organization"

def get_additional_columns(self) -> None:
"""Enhance and prepare the dataset for final output."""
self.table["RECIPIENT_TYPE"] = self.table["contributorPersentTypeCd"].apply(
self.type_classifier
)

self.table["DONOR"] = self.table.apply(
lambda row: f"{row['contributorNameLast']}, {row['contributorNameFirst']}"
if "contributorPersentTypeCd" in row
and row["contributorPersentTypeCd"] == "INDIVIDUAL"
else row.get("contributorNameOrganization", ""),
axis=1,
)
self.table["YEAR"] = pd.to_datetime(self.table["contributionDt"]).dt.year
self.table["PURPOSE"] = pd.NA
return

def preprocess_data(self) -> None:
self.get_additional_columns()
self.map_columns()
return


class TexasFilerForm(FilerForm):
def __init__(self):
column_mapper = {
"filerIdent": "RECIPIENT_ID",
# "filerName": "RECIPIENT",
# "filerTypeCd": "RECIPIENT_TYPE",
}

super().__init__(column_mapper=column_mapper)

def get_additional_columns(self) -> None:
self.table["RECIPIENT_TYPE"] = self.table.filerTypeCd.map(
const.PA_FILER_ABBREV_DICT
)
self.table["RECIPIENT"] = self.table["filerName"].apply(
lambda x: str(x).title()
)
self.table["RECIPIENT_OFFICE"] = pd.NA
self.table["RECIPIENT_PARTY"] = pd.NA

def preprocess_data(self) -> None:
"""Preprocess additional data if necessary."""
self.get_additional_columns()
self.map_columns()
return


class TexasExpenseForm(ExpenseForm):
def __init__(self):
column_mapper = {"expendAmount": "AMOUNT", "filerIdent": "DONOR_ID"}
super().__init__(column_mapper=column_mapper)

def get_additional_columns(self) -> None:
self.table["RECIPIENT"] = self.table.apply(
lambda row: f"{row['payeeNameLast']}, {row['payeeNameFirst']}"
if row["payeePersentTypeCd"] == "INDIVIDUAL"
else row["payeeNameOrganization"],
axis=1,
)
self.table["YEAR"] = pd.to_datetime(self.table["expendDt"]).dt.year
self.table["PURPOSE"] = pd.NA

def preprocess_data(self) -> None:
"""Preprocess additional data if necessary."""
self.get_additional_columns()
self.map_columns()
return
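
The TexasTransformer that drives these form classes is not part of this diff; purely as an illustration of the intended call order (using the sample file written by the scrape script above), usage would look something like:

contribution_form = TexasContributionForm()
contribution_form.read_table(["data/raw/TX/sample/contribs.csv"])
contribution_form.preprocess_data()  # derive DONOR, YEAR, RECIPIENT_TYPE, then map and filter columns
contributions = contribution_form.get_table()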
72 changes: 72 additions & 0 deletions src/utils/transform/constants.py
@@ -550,3 +550,75 @@
" WV ",
" WY ",
]
PA_CONTRIBUTION_COLS: list = [
"RECIPIENT_ID",
"DONOR",
"AMOUNT",
"YEAR",
"PURPOSE"]

PA_FILER_COLS: list = [
"RECIPIENT_ID",
"RECIPIENT_TYPE",
"RECIPIENT",
"RECIPIENT_OFFICE",
"RECIPIENT_PARTY"
]

PA_EXPENSE_COLS: list = [
"DONOR_ID",
"RECIPIENT",
"PURPOSE",
"AMOUNT",
"YEAR",
]

TX_CONTRIBUTION_COLS: list = [
'filerIdent',
'filerName',
'contributionDt',
'contributionAmount',
'contributorPersentTypeCd',
'contributorNameOrganization',
'contributorNameLast',
'contributorNameFirst']
# TO CLARIFY: (1) does "filer" refer to the recipient? (2) there is no office field for Texas
TX_FILER_COLS: list = [
"filerIdent",
"filerTypeCd",
"filerName",
"filerNameOrganization",
]

TX_FILER_MAPPING: dict = {
"filerIdent": "RECIPIENT_ID",
"filerTypeCd": "RECIPIENT_TYPE",
"filerName": "RECIPIENT",
}


# TO CLARIFY: there is no purpose field in the Texas expenditure data
TX_EXPENSE_COLS: list = [
"filerIdent",
"payeePersentTypeCd",
"payeeNameOrganization",
"payeeNameLast",
"payeeNameFirst",
"expendAmount",
"expendDt"
]

TX_CONTRIBUTION_MAPPING: dict = {
"filerIdent": "RECIPIENT_ID",
"contributionAmount": "AMOUNT"
}

TX_EXPENSE_MAPPING: dict = {
"filerIdent": "DONOR_ID",
}
2 changes: 1 addition & 1 deletion src/utils/transform/pennsylvania.py
@@ -443,7 +443,7 @@ def classify_contributor(self, entity: str) -> str:
return "Organization"
loc += 1
return "Individual"

# TODO: why is occupation dropped here?
def pre_process_contributor_dataset(
self, contributor_df: pd.DataFrame
) -> pd.DataFrame:
12 changes: 8 additions & 4 deletions src/utils/transform/pipeline.py
@@ -7,12 +7,14 @@
from utils.transform.michigan import MichiganTransformer
from utils.transform.minnesota import MinnesotaTransformer
from utils.transform.pennsylvania import PennsylvaniaTransformer
from utils.transform.texas import TexasTransformer

ALL_STATE_CLEANERS = [
ArizonaTransformer(),
MichiganTransformer(),
MinnesotaTransformer(),
PennsylvaniaTransformer(),
# ArizonaTransformer(),
# MichiganTransformer(),
# MinnesotaTransformer(),
# PennsylvaniaTransformer(),
TexasTransformer()
]


@@ -52,3 +52,5 @@ def transform_and_merge(
complete_organizations_table,
complete_transactions_table,
)


38 changes: 38 additions & 0 deletions src/utils/transform/schema.txt
@@ -0,0 +1,38 @@

/**
* The `standardized_df` is a list of essential columns that should be present
 * for the Penn and Texas tables in the transformer.
 * These columns represent the standardized data fields for the Penn and Texas transformer instances.
*
* The columns in `standardized_df` are as follows:
* - DONOR: The name of the donor.
* - DONOR_ID: The unique identifier of the donor.
* - DONOR_PARTY: The political party affiliation of the donor.
* - DONOR_TYPE: The type of the donor (individual, organization, etc.).
* - RECIPIENT: The name of the recipient.
* - RECIPIENT_ID: The unique identifier of the recipient.
* - RECIPIENT_PARTY: The political party affiliation of the recipient.
* - RECIPIENT_TYPE: The type of the recipient (individual, organization, etc.).
* - AMOUNT: The amount of the donation.
* - DONOR_OFFICE: The office held by the donor (if applicable).
* - PURPOSE: The purpose of the donation.
* - RECIPIENT_OFFICE: The office held by the recipient (if applicable).
* - YEAR: The year of the transaction.
* - TRANSACTION_ID: The unique identifier of the transaction.
*/
standardized_df = [
"DONOR",
"DONOR_ID",
"DONOR_PARTY",
"DONOR_TYPE",
"RECIPIENT",
"RECIPIENT_ID",
"RECIPIENT_PARTY",
"RECIPIENT_TYPE",
"AMOUNT",
"DONOR_OFFICE",
"PURPOSE",
"RECIPIENT_OFFICE",
"YEAR",
"TRANSACTION_ID",
]
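
Purely as an illustration (not part of this PR), a transformer's merged output could be checked against these standardized columns with a small, hypothetical helper:

def check_standardized_schema(df, required=standardized_df):
    """Raise if a transformed table is missing any standardized column."""
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Transformed table is missing columns: {missing}")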