Weekly PR from Staging to Main #1023

Merged: 23 commits from staging into main, Nov 25, 2024

Commits
3c07241  Update package-lock.json dependencies to pass precommit hook (HeyZoos, Nov 16, 2024)
450e16a  Add @maptiler/geocoding-control dependency (HeyZoos, Nov 16, 2024)
e67b5d0  Switch to MapTiler GeocodingControl (HeyZoos, Nov 16, 2024)
1949087  Simplify filter condition in PropertyMap.tsx (HeyZoos, Nov 16, 2024)
f1c6845  Cast expression operator to a string for now (HeyZoos, Nov 16, 2024)
7f3eed9  Remove mapgl dependencies (HeyZoos, Nov 16, 2024)
22d4afc  Run eslint on PropertyMap.tsx (HeyZoos, Nov 16, 2024)
46d2612  Merge pull request #1003 from HeyZoos/issue-864-clean-up-mapbox-refer… (nlebovits, Nov 18, 2024)
cd87b34  run precommit (nlebovits, Nov 18, 2024)
4465aeb  remove references to mapbox key, which we are no longer using (nlebovits, Nov 18, 2024)
8485a30  set force reload to False by default (nlebovits, Nov 18, 2024)
4d47ac6  Merge pull request #1008 from CodeForPhilly/lebovits/remove-mapbox-re… (nlebovits, Nov 18, 2024)
b40c1cd  skip precommit hook (nlebovits, Nov 21, 2024)
e28ded3  precommit hook (nlebovits, Nov 21, 2024)
7bdec82  reset regular pipeline to staging (nlebovits, Nov 21, 2024)
4255b5d  add main.py for new etl script (nlebovits, Nov 21, 2024)
e8ceaaa  fix imports (nlebovits, Nov 21, 2024)
b228c3a  clean up li complaints-related items; add dor parcel boundaries (not … (nlebovits, Nov 21, 2024)
e629bb2  more formatting (nlebovits, Nov 22, 2024)
ea35c07  Reset Dockerfile-pg to match staging as changes are outside scope of … (nlebovits, Nov 22, 2024)
f4f6964  Merge pull request #1014 from CodeForPhilly/lebovits/refactor-etl-pip… (nlebovits, Nov 22, 2024)
5306464  Merge pull request #1009 from CodeForPhilly/main (CodeWritingCow, Nov 22, 2024)
02c7dad  Merge pull request #1018 from CodeForPhilly/main (nlebovits, Nov 23, 2024)
data/docker-compose.yml (1 change: 0 additions & 1 deletion)

@@ -6,7 +6,6 @@ services:
     image: vacant-lots-proj:latest
     environment:
       - GOOGLE_APPLICATION_CREDENTIALS=/app/service-account-key.json
-      - CFP_MAPBOX_TOKEN_UPLOADER
       - VACANT_LOTS_DB
       - CLEAN_GREEN_GOOGLE_KEY
       - PYTHONUNBUFFERED=1
data/src/config/config.py (3 changes: 0 additions & 3 deletions)

@@ -8,9 +8,6 @@
 USE_CRS = "EPSG:2272"
 """ the standard geospatial code for Pennsylvania South (ftUS) """

-MAPBOX_TOKEN = os.environ.get("CFP_MAPBOX_TOKEN_UPLOADER")
-""" The location of the token for your mapbox account in your environment """
-
 log_level: int = logging.WARN
 """ overall log level for the project """
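
With MAPBOX_TOKEN gone from config.py, the variables that remain in docker-compose.yml are presumably still read via the same os.environ pattern elsewhere in the pipeline. A minimal sketch of that pattern, assuming call sites analogous to the removed line (the names come from the compose file; the actual consuming modules are outside this diff):

import os

# Sketch only: mirrors the os.environ.get pattern of the removed MAPBOX_TOKEN
# line. Variable names are taken from data/docker-compose.yml; where the
# project actually reads them is not shown in this PR.
VACANT_LOTS_DB = os.environ.get("VACANT_LOTS_DB")
CLEAN_GREEN_GOOGLE_KEY = os.environ.get("CLEAN_GREEN_GOOGLE_KEY")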
data/src/main.py (new file: 155 additions & 0 deletions)

@@ -0,0 +1,155 @@
import sys

from new_etl.data_utils.access_process import access_process
from new_etl.data_utils.contig_neighbors import contig_neighbors
from new_etl.data_utils.dev_probability import dev_probability
from new_etl.data_utils.negligent_devs import negligent_devs
from new_etl.data_utils.opa_properties import opa_properties
from new_etl.data_utils.priority_level import priority_level
from new_etl.data_utils.vacant_properties import vacant_properties
from new_etl.data_utils.pwd_parcels import pwd_parcels
from new_etl.data_utils.city_owned_properties import city_owned_properties
from new_etl.data_utils.phs_properties import phs_properties
from new_etl.data_utils.li_violations import li_violations
from new_etl.data_utils.li_complaints import li_complaints
from new_etl.data_utils.rco_geoms import rco_geoms
from new_etl.data_utils.council_dists import council_dists
from new_etl.data_utils.tree_canopy import tree_canopy
from new_etl.data_utils.nbhoods import nbhoods
from new_etl.data_utils.gun_crimes import gun_crimes
from new_etl.data_utils.drug_crimes import drug_crimes
from new_etl.data_utils.delinquencies import delinquencies
from new_etl.data_utils.unsafe_buildings import unsafe_buildings
from new_etl.data_utils.imm_dang_buildings import imm_dang_buildings
from new_etl.data_utils.tactical_urbanism import tactical_urbanism
from new_etl.data_utils.conservatorship import conservatorship
from new_etl.data_utils.owner_type import owner_type
from new_etl.data_utils.community_gardens import community_gardens
from new_etl.data_utils.park_priority import park_priority
from new_etl.data_utils.ppr_properties import ppr_properties

import pandas as pd


# Ensure the directory containing awkde is in the Python path
awkde_path = "/usr/src/app"
if awkde_path not in sys.path:
    sys.path.append(awkde_path)

services = [
    # vacant designation
    vacant_properties,  # needs to run early so that other utils can make use of the `vacant` designation
    # geometries/areas
    pwd_parcels,
    council_dists,
    nbhoods,
    rco_geoms,
    # ownership
    city_owned_properties,
    phs_properties,
    community_gardens,
    ppr_properties,
    owner_type,
    # quality of life
    li_violations,
    li_complaints,
    tree_canopy,
    gun_crimes,
    drug_crimes,
    delinquencies,
    unsafe_buildings,
    imm_dang_buildings,
    # development
    contig_neighbors,
    dev_probability,
    negligent_devs,
    # access/interventions
    tactical_urbanism,
    conservatorship,
    park_priority,
]

dataset = opa_properties()

print("Initial Dataset:")
print("Shape:", dataset.gdf.shape)
print("Head:\n", dataset.gdf.head())
print("NA Counts:\n", dataset.gdf.isna().sum())

for service in services:
    dataset = service(dataset)
    print(f"After {service.__name__}:")
    print("Dataset type:", type(dataset.gdf).__name__)
    print("Shape:", dataset.gdf.shape)
    print("Head:\n", dataset.gdf.head())
    print("NA Counts:\n", dataset.gdf.isna().sum())

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(
    f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}"
)

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}")

# Convert problematic columns to numeric
numeric_columns = [
    "market_value",
    "sale_price",
    "total_assessment",
    "total_due",
    "num_years_owed",
    "permit_count",
]
for col in numeric_columns:
    dataset.gdf[col] = pd.to_numeric(dataset.gdf[col], errors="coerce")

dataset.gdf["most_recent_year_owed"] = dataset.gdf["most_recent_year_owed"].astype(str)

print("Column data types before exporting to Parquet:")
print(dataset.gdf.dtypes)

# Quick dataset profiling
print("\nQuick dataset profile:")

# 1) Number of NA values per column
print("\nNumber of NA values per column:")
print(dataset.gdf.isna().sum())

# 2) Mean, median, and std of numeric columns
print("\nMean, Median, and Standard Deviation of numeric columns:")
numeric_columns = dataset.gdf.select_dtypes(include=["float", "int"]).columns

for column in numeric_columns:
    mean = dataset.gdf[column].mean()
    median = dataset.gdf[column].median()
    std = dataset.gdf[column].std()
    print(f"{column}:\n Mean: {mean:.2f}\n Median: {median:.2f}\n Std: {std:.2f}")

# 3) Number of unique values in string columns
print("\nNumber of unique values in string columns:")
string_columns = dataset.gdf.select_dtypes(include=["object", "string"]).columns
unique_values = dataset.gdf[string_columns].nunique()
print(unique_values)

dataset.gdf.to_parquet("tmp/test_output.parquet")
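
For reference, the loop dataset = service(dataset) above implies a shared contract: each entry in services accepts the dataset wrapper and returns it with new columns added to its .gdf GeoDataFrame. A minimal sketch of that contract, using a hypothetical Dataset wrapper (the repo's real class may differ; main.py only relies on a mutable .gdf attribute):

import geopandas as gpd

# Hypothetical wrapper for illustration: main.py only touches a mutable
# `.gdf` attribute, so any class exposing one satisfies the contract.
class Dataset:
    def __init__(self, gdf: gpd.GeoDataFrame):
        self.gdf = gdf

def example_service(dataset: Dataset) -> Dataset:
    # A service enriches the shared GeoDataFrame and returns the wrapper,
    # which is what lets main.py chain services in a simple loop.
    dataset.gdf["example_flag"] = dataset.gdf.geometry.notna()
    return dataset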
Empty file added: data/src/new_etl/__init__.py