From cabf0e18201897baaaee7b4fb38861c0bfa48db2 Mon Sep 17 00:00:00 2001
From: nlebovits <nissim.lebovits@vanderbilt.edu>
Date: Thu, 10 Oct 2024 19:12:07 -0400
Subject: [PATCH 1/4] filter li complaints and violations for only
 blight-related instances

---
 data/src/data_utils/l_and_i.py | 65 +++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/data/src/data_utils/l_and_i.py b/data/src/data_utils/l_and_i.py
index 20cc9790..27f28147 100644
--- a/data/src/data_utils/l_and_i.py
+++ b/data/src/data_utils/l_and_i.py
@@ -1,18 +1,43 @@
 import pandas as pd
+import geopandas as gpd
+from typing import List
 from classes.featurelayer import FeatureLayer
 from constants.services import COMPLAINTS_SQL_QUERY, VIOLATIONS_SQL_QUERY
 
+def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
+    """
+    Process L&I (Licenses and Inspections) data for complaints and violations.
+
+    This function filters and processes L&I complaints and violations data,
+    joining it with the primary feature layer based on spatial relationships
+    and OPA (Office of Property Assessment) identifiers.
+
+    Args:
+        primary_featurelayer (FeatureLayer): The primary feature layer to join L&I data to.
+
+    Returns:
+        FeatureLayer: The primary feature layer updated with L&I data.
+    """
+    keywords: List[str] = [
+        'dumping', 'blight', 'rubbish', 'weeds', 'graffiti',
+        'abandoned', 'sanitation', 'litter', 'vacant', 'trash',
+        'unsafe'
+    ]
 
-def l_and_i(primary_featurelayer):
     # Load complaints data from L&I
-    l_and_i_complaints = FeatureLayer(
+    l_and_i_complaints: FeatureLayer = FeatureLayer(
         name="LI Complaints",
         carto_sql_queries=COMPLAINTS_SQL_QUERY
     )
 
-    # filter for only Status = 'Open'
+    # Filter for rows where 'subject' contains any of the keywords
     l_and_i_complaints.gdf = l_and_i_complaints.gdf[
-        l_and_i_complaints.gdf["status"] == "Open"
+        l_and_i_complaints.gdf["subject"].str.lower().str.contains('|'.join(keywords))
+    ]
+
+    # Filter for only Status = 'Open'
+    l_and_i_complaints.gdf = l_and_i_complaints.gdf[
+        l_and_i_complaints.gdf["status"].str.lower() == "open"
     ]
 
     # Group by geometry and concatenate the violationcodetitle values into a list with a semicolon separator
@@ -30,13 +55,18 @@ def l_and_i(primary_featurelayer):
     )
 
     # Load data for violations from L&I
-    l_and_i_violations = FeatureLayer(
+    l_and_i_violations: FeatureLayer = FeatureLayer(
         name="LI Violations",
         carto_sql_queries=VIOLATIONS_SQL_QUERY,
         from_xy=True
     )
 
-    all_violations_count_df = (
+    # Filter for rows where 'violationcodetitle' contains any of the keywords, handling NaN values
+    l_and_i_violations.gdf = l_and_i_violations.gdf[
+        l_and_i_violations.gdf["violationcodetitle"].fillna('').str.lower().str.contains('|'.join(keywords))
+    ]
+
+    all_violations_count_df: pd.DataFrame = (
         l_and_i_violations.gdf.groupby("opa_account_num")
         .count()
         .reset_index()[["opa_account_num", "violationnumber", "geometry"]]
@@ -45,11 +75,11 @@ def l_and_i(primary_featurelayer):
         columns={"violationnumber": "all_violations_past_year"}
     )
     # filter for only cases where the casestatus is 'IN VIOLATION' or 'UNDER INVESTIGATION'
-    violations_gdf = l_and_i_violations.gdf[
-        (l_and_i_violations.gdf["violationstatus"] == "OPEN")
+    violations_gdf: gpd.GeoDataFrame = l_and_i_violations.gdf[
+        (l_and_i_violations.gdf["violationstatus"].str.lower() == "open")
     ]
 
-    open_violations_count_df = (
+    open_violations_count_df: pd.DataFrame = (
         violations_gdf.groupby("opa_account_num")
         .count()
         .reset_index()[["opa_account_num", "violationnumber", "geometry"]]
@@ -58,7 +88,7 @@ def l_and_i(primary_featurelayer):
         columns={"violationnumber": "open_violations_past_year"}
     )
     # join the all_violations_count_df and open_violations_count_df dataframes on opa_account_num
-    violations_count_gdf = all_violations_count_df.merge(
+    violations_count_gdf: gpd.GeoDataFrame = all_violations_count_df.merge(
         open_violations_count_df, how="left", on="opa_account_num"
     )
 
@@ -96,7 +126,7 @@ def l_and_i(primary_featurelayer):
     )
 
     # Complaints need a spatial join, but we need to take special care to merge on just the parcel geoms first to get opa_id
-    complaints_with_opa_id = primary_featurelayer.gdf.sjoin(
+    complaints_with_opa_id: gpd.GeoDataFrame = primary_featurelayer.gdf.sjoin(
         l_and_i_complaints.gdf, how="left", predicate="contains"
     )
     complaints_with_opa_id.drop(columns=["index_right"], inplace=True)
@@ -109,7 +139,16 @@ def l_and_i(primary_featurelayer):
     )
 
     # Clean up the NaN values in the li_complaints column
-    def remove_nan_strings(x):
+    def remove_nan_strings(x: str) -> str | None:
+        """
+        Remove 'nan' strings from the input.
+
+        Args:
+            x (str): Input string.
+
+        Returns:
+            str | None: Cleaned string or None if only 'nan' values.
+        """
         if x == "nan" or ("nan;" in x):
             return None
         else:
@@ -136,4 +175,4 @@ def remove_nan_strings(x):
         .astype(int)
     )
 
-    return primary_featurelayer
+    return primary_featurelayer
\ No newline at end of file

From 2a7b1307d0c9aa2a0a6849aac12e291cd92af573 Mon Sep 17 00:00:00 2001
From: nlebovits <nissim.lebovits@vanderbilt.edu>
Date: Thu, 10 Oct 2024 19:29:00 -0400
Subject: [PATCH 2/4] update owner types to include public

---
 data/src/data_utils/llc_owner.py           | 16 ----------
 data/src/data_utils/owner_type.py          | 37 ++++++++++++++++++++++
 data/src/script.py                         |  4 +--
 src/components/FilterView.tsx              |  4 +--
 src/components/Filters/DimensionFilter.tsx |  7 ++--
 5 files changed, 45 insertions(+), 23 deletions(-)
 delete mode 100644 data/src/data_utils/llc_owner.py
 create mode 100644 data/src/data_utils/owner_type.py

diff --git a/data/src/data_utils/llc_owner.py b/data/src/data_utils/llc_owner.py
deleted file mode 100644
index 181c5772..00000000
--- a/data/src/data_utils/llc_owner.py
+++ /dev/null
@@ -1,16 +0,0 @@
-def llc_owner(primary_featurelayer):
-    llc_owners = []
-
-    for _, row in primary_featurelayer.gdf.iterrows():
-        # Extracting owner1 and owner2 from the row
-        owner1 = str(row["owner_1"]).lower()
-        owner2 = str(row["owner_2"]).lower()
-
-        # Checking if " llc" is in either owner1 or owner2
-        if " llc" in owner1 or " llc" in owner2:
-            llc_owners.append("Yes")
-        else:
-            llc_owners.append("No")
-
-    primary_featurelayer.gdf["llc_owner"] = llc_owners
-    return primary_featurelayer
diff --git a/data/src/data_utils/owner_type.py b/data/src/data_utils/owner_type.py
new file mode 100644
index 00000000..291364df
--- /dev/null
+++ b/data/src/data_utils/owner_type.py
@@ -0,0 +1,37 @@
+import pandas as pd
+from classes.featurelayer import FeatureLayer
+
+def owner_type(primary_featurelayer: FeatureLayer) -> FeatureLayer:
+    """
+    Classify each row of the layer's GeoDataFrame into an ownership type using the
+    'owner_1', 'owner_2', and 'city_owner_agency' columns. Branches are checked in order:
+    - "Public" if 'city_owner_agency' is not NA (owner names are then ignored).
+    - "Business (LLC)" if the lowercased owner_1 or owner_2 contains " llc" (leading space).
+    - "Individual" otherwise, i.e. no agency and no " llc" match in either owner field.
+
+    Args:
+        primary_featurelayer (FeatureLayer): Layer whose `gdf` provides the three columns above.
+
+    Returns:
+        FeatureLayer: The same object, with an 'owner_type' column written to its `gdf`.
+    """
+    owner_types = []
+
+    for _, row in primary_featurelayer.gdf.iterrows():
+        # str() coerces missing owners (NaN/None) to text so .lower() is always safe
+        owner1 = str(row["owner_1"]).lower()
+        owner2 = str(row["owner_2"]).lower()
+        city_owner_agency = row["city_owner_agency"]
+
+        # First match wins; " llc" needs a preceding space, so a name starting with "LLC" is missed
+        if pd.notna(city_owner_agency):
+            owner_types.append("Public")
+        elif " llc" in owner1 or " llc" in owner2:
+            owner_types.append("Business (LLC)")
+        else:
+            owner_types.append("Individual")
+
+    # One label was appended per row, in iteration order, so the list lines up with the gdf
+    primary_featurelayer.gdf["owner_type"] = owner_types
+
+    return primary_featurelayer
diff --git a/data/src/script.py b/data/src/script.py
index 763de925..c5f3a8aa 100644
--- a/data/src/script.py
+++ b/data/src/script.py
@@ -16,7 +16,7 @@
 from data_utils.gun_crimes import gun_crimes
 from data_utils.imm_dang_buildings import imm_dang_buildings
 from data_utils.l_and_i import l_and_i
-from data_utils.llc_owner import llc_owner
+from data_utils.owner_type import owner_type
 from data_utils.nbhoods import nbhoods
 from data_utils.negligent_devs import negligent_devs
 from data_utils.opa_properties import opa_properties
@@ -50,7 +50,7 @@
     imm_dang_buildings,
     tactical_urbanism,
     conservatorship,
-    llc_owner,
+    owner_type,
     community_gardens,
     park_priority,
     ppr_properties,
diff --git a/src/components/FilterView.tsx b/src/components/FilterView.tsx
index fab565d7..2666f12c 100644
--- a/src/components/FilterView.tsx
+++ b/src/components/FilterView.tsx
@@ -51,9 +51,9 @@ const filters = [
     type: 'buttonGroup',
   },
   {
-    property: 'llc_owner',
+    property: 'owner_type',
     display: 'Owner',
-    options: ['Yes', 'No'],
+    options: ['Public', 'Business (LLC)', 'Individual'],
     type: 'buttonGroup',
   },
 ];
diff --git a/src/components/Filters/DimensionFilter.tsx b/src/components/Filters/DimensionFilter.tsx
index fdd1a391..0c7f374a 100644
--- a/src/components/Filters/DimensionFilter.tsx
+++ b/src/components/Filters/DimensionFilter.tsx
@@ -19,9 +19,10 @@ type OptionDisplayMapping = {
 };
 
 const optionsDisplayMapping: OptionDisplayMapping = {
-  llc_owner: {
-    Yes: 'Business',
-    No: 'Individual',
+  owner_type: {
+    Public: 'Public',
+    'Business (LLC)': 'Business (LLC)',
+    Individual: 'Individual',
   },
 };
 

From 5397e2e1be4116c21824491b0ac029fbc45236b0 Mon Sep 17 00:00:00 2001
From: nlebovits <nissim.lebovits@vanderbilt.edu>
Date: Thu, 10 Oct 2024 20:56:42 -0400
Subject: [PATCH 3/4] lint and format

---
 data/src/data_utils/access_process.py        | 30 +++++++++++++++++---
 data/src/data_utils/city_owned_properties.py | 25 +++++++++++++++-
 data/src/data_utils/conservatorship.py       |  2 +-
 3 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py
index d1d2deb9..c888f5cc 100644
--- a/data/src/data_utils/access_process.py
+++ b/data/src/data_utils/access_process.py
@@ -1,4 +1,20 @@
-def access_process(dataset):
+from typing import Any
+
+def access_process(dataset: Any) -> Any:
+    """
+    Process a dataset to determine the access process for each property based on
+    city ownership and market value. The result is added as a new column in the dataset.
+
+    Args:
+        dataset (Any): The dataset containing a GeoDataFrame named `gdf` with
+                       columns "city_owner_agency" and "market_value".
+
+    Returns:
+        Any: The updated dataset with an additional "access_process" column.
+
+    Side Effects:
+        Prints the distribution of the "access_process" column.
+    """
     access_processes = []
 
     for _, row in dataset.gdf.iterrows():
@@ -9,9 +25,9 @@ def access_process(dataset):
         )
 
         # Simplified decision logic
-        if city_owner_agency == "PLB":
-            access_process = "Land Bank"
-        elif city_owner_agency in ["PRA", "PHDC"]:
+        if city_owner_agency == "Land Bank (PHDC)":
+            access_process = "Go through Land Bank"
+        elif city_owner_agency == "PRA":
             access_process = "Do Nothing"
         else:
             if market_value_over_1000:
@@ -22,4 +38,10 @@ def access_process(dataset):
         access_processes.append(access_process)
 
     dataset.gdf["access_process"] = access_processes
+
+    # Print the distribution of "access_process"
+    distribution = dataset.gdf["access_process"].value_counts()
+    print("Distribution of access process:")
+    print(distribution)
+
     return dataset
diff --git a/data/src/data_utils/city_owned_properties.py b/data/src/data_utils/city_owned_properties.py
index 602871d7..b4277ca8 100644
--- a/data/src/data_utils/city_owned_properties.py
+++ b/data/src/data_utils/city_owned_properties.py
@@ -1,8 +1,22 @@
+from typing import Any
 from classes.featurelayer import FeatureLayer
 from constants.services import CITY_OWNED_PROPERTIES_TO_LOAD
 
+def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
+    """
+    Processes city-owned property data by joining it with the primary feature layer,
+    renaming columns, and updating access information for properties based on ownership.
+    All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)".
+    The function also prints the counts of properties with "PLB" and "Land Bank (PHDC)" agencies.
 
-def city_owned_properties(primary_featurelayer):
+    Args:
+        primary_featurelayer (FeatureLayer): The primary feature layer to which city-owned 
+                                             property data will be joined.
+
+    Returns:
+        FeatureLayer: The updated primary feature layer with processed city ownership 
+                      information.
+    """
     city_owned_properties = FeatureLayer(
         name="City Owned Properties",
         esri_rest_urls=CITY_OWNED_PROPERTIES_TO_LOAD,
@@ -60,4 +74,13 @@ def city_owned_properties(primary_featurelayer):
         "side_yard_eligible"
     ].fillna("No")
 
+    # Update all instances where city_owner_agency is "PLB" to "Land Bank (PHDC)"
+    primary_featurelayer.gdf.loc[
+        primary_featurelayer.gdf["city_owner_agency"] == "PLB", "city_owner_agency"
+    ] = "Land Bank (PHDC)"
+
+    # Print the counts for "PLB" and "Land Bank (PHDC)"
+    plb_count = primary_featurelayer.gdf["city_owner_agency"].eq("PLB").sum()
+    land_bank_count = primary_featurelayer.gdf["city_owner_agency"].eq("Land Bank (PHDC)").sum()
+
     return primary_featurelayer
diff --git a/data/src/data_utils/conservatorship.py b/data/src/data_utils/conservatorship.py
index 349c9e44..5f9c9793 100644
--- a/data/src/data_utils/conservatorship.py
+++ b/data/src/data_utils/conservatorship.py
@@ -44,7 +44,7 @@ def conservatorship(primary_featurelayer):
             sale_date_6_months_ago = False
 
         # Simplified decision logic
-        if city_owner_agency == "PLB" or (
+        if city_owner_agency == "Land Bank (PHDC)" or (
             not sale_date_6_months_ago and market_value_over_1000
         ):
             conservatorship = "No"

From 5b9f4aae1808832a49a7f6fc2e032a4547a56ac1 Mon Sep 17 00:00:00 2001
From: nlebovits <nissim.lebovits@vanderbilt.edu>
Date: Thu, 10 Oct 2024 20:59:08 -0400
Subject: [PATCH 4/4] remove unneeded print statements

---
 data/src/data_utils/city_owned_properties.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/data/src/data_utils/city_owned_properties.py b/data/src/data_utils/city_owned_properties.py
index b4277ca8..a5b21980 100644
--- a/data/src/data_utils/city_owned_properties.py
+++ b/data/src/data_utils/city_owned_properties.py
@@ -7,7 +7,6 @@ def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
     Processes city-owned property data by joining it with the primary feature layer,
     renaming columns, and updating access information for properties based on ownership.
     All instances where the "city_owner_agency" is "PLB" are changed to "Land Bank (PHDC)".
-    The function also prints the counts of properties with "PLB" and "Land Bank (PHDC)" agencies.
 
     Args:
         primary_featurelayer (FeatureLayer): The primary feature layer to which city-owned 
@@ -79,8 +78,4 @@ def city_owned_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
         primary_featurelayer.gdf["city_owner_agency"] == "PLB", "city_owner_agency"
     ] = "Land Bank (PHDC)"
 
-    # Print the counts for "PLB" and "Land Bank (PHDC)"
-    plb_count = primary_featurelayer.gdf["city_owner_agency"].eq("PLB").sum()
-    land_bank_count = primary_featurelayer.gdf["city_owner_agency"].eq("Land Bank (PHDC)").sum()
-
     return primary_featurelayer