Skip to content

Commit

Permalink
setup for initial build of new cdbg data product (#1355)
Browse files Browse the repository at this point in the history
* add cdbg build and test to github actions

* cdbg product folder and dbt project setup

* ingest template for hud data

* cdbg recipe, sources.yml

* cdbg transformation logic

* cdbg export script

---------

Co-authored-by: Finn van Krieken <[email protected]>
  • Loading branch information
damonmcc and fvankrieken authored Dec 31, 2024
1 parent 47f2592 commit 2bac1f3
Show file tree
Hide file tree
Showing 23 changed files with 753 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ on:
options:
- template
- cbbr
- cdbg
- checkbook
- colp
- cpdb
Expand Down Expand Up @@ -128,6 +129,17 @@ jobs:
logging_level: ${{ inputs.logging_level }}
build_note: ${{ inputs.build_note }}
dev_bucket: ${{ inputs.dev_bucket && format('de-dev-{0}', inputs.dev_bucket) || '' }}
# Build the CDBG data product by calling the reusable cdbg_build.yml workflow.
# Runs after health_check, and only when the requested dataset_name is 'cdbg' or 'all'.
cdbg:
needs: health_check
if: inputs.dataset_name == 'cdbg' || inputs.dataset_name == 'all'
uses: ./.github/workflows/cdbg_build.yml
secrets: inherit
with:
# image tag, build name and plan command are all produced by the health_check job
image_tag: ${{ needs.health_check.outputs.tag }}
recipe_file: ${{ inputs.recipe_file }}
build_name: ${{ needs.health_check.outputs.build_name }}
plan_command: ${{ needs.health_check.outputs.plan_command }}
# a dev bucket name is expanded to 'de-dev-<name>'; an empty string is passed when none is given
dev_bucket: ${{ inputs.dev_bucket && format('de-dev-{0}', inputs.dev_bucket) || '' }}
checkbook:
needs: health_check
if: inputs.dataset_name == 'checkbook' || inputs.dataset_name == 'all'
Expand Down
76 changes: 76 additions & 0 deletions .github/workflows/cdbg_build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Reusable workflow (workflow_call) that plans, loads, builds, exports and uploads
# the CDBG data product from the products/cdbg folder.
name: CDBG - 🏗️ Build
on:
workflow_call:
inputs:
# tag of the nycplanning/build-base image to run in; 'latest' is used when omitted
image_tag:
type: string
required: false
build_name:
type: string
required: true
# recipe path without the '.lock.yml' suffix (the suffix is appended in the Dataloading step)
recipe_file:
type: string
required: true
# argument passed to dcpy.lifecycle.builds.plan
plan_command:
type: string
default: recipe
# when set, replaces both the recipes and publishing buckets and turns DEV_FLAG on
dev_bucket:
type: string
required: false

jobs:
build:
name: Build CDBG
runs-on: ubuntu-22.04
defaults:
run:
shell: bash
working-directory: products/cdbg
container:
image: nycplanning/build-base:${{ inputs.image_tag || 'latest' }}
env:
BUILD_ENGINE_DB: db-cdbg
BUILD_NAME: ${{ inputs.build_name }}
# production buckets are the fallback when no dev bucket is supplied
RECIPES_BUCKET: ${{ inputs.dev_bucket || 'edm-recipes' }}
PUBLISHING_BUCKET: ${{ inputs.dev_bucket || 'edm-publishing' }}
DEV_FLAG: ${{ inputs.dev_bucket && 'true' || 'false' }}
steps:
- uses: actions/checkout@v4

# Pull S3 and build-engine database credentials from 1Password and export them as env vars
- name: Load Secrets
uses: 1password/load-secrets-action@v1
with:
export-env: true
env:
OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
AWS_S3_ENDPOINT: "op://Data Engineering/DO_keys/AWS_S3_ENDPOINT"
AWS_SECRET_ACCESS_KEY: "op://Data Engineering/DO_keys/AWS_SECRET_ACCESS_KEY"
AWS_ACCESS_KEY_ID: "op://Data Engineering/DO_keys/AWS_ACCESS_KEY_ID"
BUILD_ENGINE_SERVER: "op://Data Engineering/EDM_DATA/server_url"
BUILD_ENGINE_HOST: "op://Data Engineering/EDM_DATA/server"
BUILD_ENGINE_USER: "op://Data Engineering/EDM_DATA/username"
BUILD_ENGINE_PASSWORD: "op://Data Engineering/EDM_DATA/password"
BUILD_ENGINE_PORT: "op://Data Engineering/EDM_DATA/port"

# Setup scripts live at the repository root, so this step overrides the default working directory
- name: Setup build environment
working-directory: ./
run: |
./bash/docker_container_setup.sh
./bash/build_env_setup.sh
- name: Plan build
run: python3 -m dcpy.lifecycle.builds.plan ${{ inputs.plan_command }}

# NOTE(review): this step invokes `python` while the Plan and Upload steps invoke `python3` -
# confirm both resolve to the same interpreter in the build-base image
- name: Dataloading
run: python -m dcpy.lifecycle.builds.load load --recipe-path ${{ inputs.recipe_file }}.lock.yml

- name: Build
run: |
dbt debug
dbt build
- name: Export
run: ./bash/export.sh

- name: Upload
run: python3 -m dcpy.connectors.edm.publishing upload --product db-cdbg --acl public-read
1 change: 1 addition & 0 deletions .github/workflows/test_helper.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ jobs:
project:
- green_fast_track
- zoningtaxlots
- cdbg
steps:
- uses: actions/checkout@v4
- name: setup
Expand Down
60 changes: 60 additions & 0 deletions dcpy/lifecycle/ingest/templates/hud_lowmodincomebyblockgroup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
id: hud_lowmodincomebyblockgroup
acl: public-read

attributes:
name: HUD Low to Moderate Income Population by Block Group
description: >-
This particular version of this dataset has come from OMB. We haven't fully assessed whether
we could just pull from the linked feature service instead and filter to NYC or if OMB has
done any other preprocessing.
This service identifies U.S. Census Block Groups in which 51% or more of the households earn
less than 80 percent of the Area Median Income (AMI). The Community Development Block Grant
(CDBG) program requires that each CDBG funded activity must either principally benefit low-
and moderate-income persons, aid in the prevention or elimination of slums or blight, or meet
a community development need having a particular urgency because existing conditions pose a
serious and immediate threat to the health or welfare of the community and other financial
resources are not available to meet that need. With respect to activities that principally
benefit low- and moderate-income persons, at least 51 percent of the activity's beneficiaries
must be low and moderate income.
For CDBG, a person is considered to be of low income only if he or she is a member of a
household whose income would qualify as "very low income" under the Section 8 Housing
Assistance Payments program. Generally, these Section 8 limits are based on 50% of area
median. Similarly, CDBG moderate income relies on Section 8 "lower income" limits, which
are generally tied to 80% of area median. These data are from the 2011-2015 American
Community Survey (ACS).
url: https://hudgis-hud.opendata.arcgis.com/datasets/HUD::low-to-moderate-income-population-by-block-group/about

ingestion:
source:
type: s3
bucket: edm-recipes
key: inbox/omb/20241227/ACS-2020-Low-Mod-Summarized-All-Block-Groups-2023.csv
file_format:
type: csv

#columns:
#- id: CDBGUOGID
#- id: GEOID
#- id: CDBGNAME
#- id: STUSAB
#- id: STATE
#- id: CDBGTYPE
#- id: Geoname
#- id: COUNTY
#- id: TRACT
#- id: BLKGRP
#- id: LOW
#- id: LOWMOD
#- id: LMMI
#- id: LOWMODUNIV
#- id: LOWMOD_PCT
#- id: MOE_LOWMODPCT
#- id: Column1
22 changes: 22 additions & 0 deletions products/cdbg/bash/export.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Export the CDBG build: copy the build metadata files and the product tables
# into a fresh output/ folder, then zip that folder.
# set_error_traps and csv_export are not defined here; presumably they come from the sourced utils.sh.
source ../../bash/utils.sh
set_error_traps

# Always start from an empty output folder
rm -rf output

echo "Export product tables"
mkdir -p output && (
    cd output

    echo "Copy metadata files"
    for metadata_file in source_data_versions.csv build_metadata.json; do
        cp "../${metadata_file}" .
    done

    # Each product table is exported under its own name
    for table in cdbg_block_groups cdbg_tracts; do
        echo "export ${table}.csv ..."
        csv_export "${table}" "${table}"
    done
)

zip -r output/output.zip output
21 changes: 21 additions & 0 deletions products/cdbg/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# dbt project configuration for the CDBG data product
name: "cdbg"

profile: "dcp-de-postgres"

model-paths: ["models"]

# Store the rows returned by failing tests; "_tests" is the schema setting applied to tests
tests:
+store_failures: true
schema: "_tests"

# Default materialization per model folder: staging models are views,
# intermediate and product models are tables
models:
cdbg:
staging:
+materialized: view
intermediate:
+materialized: table
product:
+materialized: table

flags:
fail-fast: true
9 changes: 9 additions & 0 deletions products/cdbg/macros/test_sum.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{#
    Generic test sum_by: groups `model` by `group_by`, sums `target_column` per group and
    returns every group whose sum, rounded to `precision` decimal places (default 4),
    differs from `val`. The summed values are returned alongside as an array (`vals`).
#}
{% test sum_by(model, group_by, target_column, val, precision=4) %}

SELECT
    {{ group_by }},
    sum({{ target_column }}) AS sum,
    array_agg({{ target_column }}) AS vals
FROM {{ model }}
GROUP BY {{ group_by }}
HAVING round(cast(sum({{ target_column }}) AS numeric), {{ precision }}) != {{ val }}

{% endtest %}
17 changes: 17 additions & 0 deletions products/cdbg/models/_sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
version: 2

# Source tables read by the CDBG models
sources:
- name: recipe_sources
# the schema name is taken from the BUILD_ENGINE_SCHEMA environment variable
schema: "{{ env_var('BUILD_ENGINE_SCHEMA') }}"
tables:
# only this source has column tests: bbl and geometry must be present
- name: dcp_mappluto_clipped
columns:
- name: bbl
tests:
- not_null
- name: wkb_geometry
tests:
- not_null
- name: dcp_cb2020_wi
- name: dcp_ct2020_wi
- name: hud_lowmodincomebyblockgroup
68 changes: 68 additions & 0 deletions products/cdbg/models/intermediate/_intermediate_models.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
version: 2

models:
# Column names below mirror the columns selected by int__block_groups.sql
- name: int__block_groups
description: residential area and low-to-moderate income data aggregated by census block group
columns:
- name: geoid
tests: [unique, not_null]
- name: borough_name
- name: tract
- name: block_group
- name: total_floor_area
- name: residential_floor_area
- name: residential_floor_area_percentage
- name: total_population
- name: low_mod_income_population
- name: low_mod_income_population_percentage

# Lot-level rows: one row per lot / block group pair with the lot's floor areas
- name: int__lot_block_groups_details
description: int__lot_block_groups joined to pluto for lot info
columns:
- name: bbl
- name: block_group_geoid
- name: overlap_ratio
- name: bldgarea
- name: bldgarea_in_block_group
- name: resarea
- name: resarea_in_block_group

- name: int__lot_block_groups_raw
description: unique intersections of pluto lots and census block groups with proportion of lot in block group
columns:
- name: bbl
- name: block_group_geoid
- name: overlap_ratio

# NOTE(review): the description says "90%+", but int__lot_block_groups.sql filters on
# overlap_ratio > 0.9 (strictly greater) - confirm which is intended
- name: int__lot_block_groups
description: >-
unique intersections of pluto lots and census block groups with proportion of lot in block group,
corrected to assign lots fully to block groups that contain 90%+ of a lot
columns:
- name: bbl
- name: block_group_geoid
- name: overlap_ratio
# each (bbl, block_group_geoid) pair must be unique; sum_by only warns when
# a lot's overlap ratios do not sum to 1
tests:
- dbt_utils.unique_combination_of_columns:
combination_of_columns: [bbl, block_group_geoid]
- sum_by:
group_by: bbl
target_column: overlap_ratio
val: 1
config:
severity: warn

# NOTE(review): block_group is listed as a column of this tract-level model -
# confirm against int__tracts.sql
- name: int__tracts
description: residential area and low-to-moderate income data aggregated by census tract
columns:
- name: geoid
tests: [unique, not_null]
- name: borough_name
- name: tract
- name: block_group
- name: total_floor_area
- name: residential_floor_area
- name: residential_floor_area_percentage
- name: total_population
- name: lowmod_population
- name: lowmod_population_percentage
39 changes: 39 additions & 0 deletions products/cdbg/models/intermediate/int__block_groups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- Aggregates lot-level floor area up to census block groups and attaches the
-- low-to-moderate income figures for each block group.
-- Only block groups that have at least one lot row appear in the output; a block
-- group with no matching income row keeps NULL income columns (LEFT JOIN).
WITH lot_block_groups AS (
    SELECT * FROM {{ ref("int__lot_block_groups_details") }}
),

block_groups_income AS (
    SELECT * FROM {{ ref("stg__low_mod_by_block_group") }}
),

-- total and residential floor area per block group
block_groups_floor_area AS (
    SELECT
        lot_block_groups.block_group_geoid AS geoid,
        sum(lot_block_groups.bldgarea_in_block_group) AS total_floor_area,
        sum(lot_block_groups.resarea_in_block_group) AS residential_floor_area
    FROM lot_block_groups
    -- group on the input column rather than the "geoid" output alias: Postgres resolves a
    -- GROUP BY name to an input column first, so an upstream column named geoid would
    -- silently change the grouping
    GROUP BY lot_block_groups.block_group_geoid
),

block_group_details AS (
    SELECT
        block_groups_floor_area.geoid,
        block_groups_income.boro AS borough_name,
        block_groups_income.tract,
        block_groups_income.block_group,
        block_groups_floor_area.total_floor_area,
        block_groups_floor_area.residential_floor_area,
        -- percentage of floor area that is residential; 0 when there is no floor area at all
        CASE
            WHEN block_groups_floor_area.total_floor_area = 0
                THEN 0
            ELSE
                (
                    block_groups_floor_area.residential_floor_area
                    / block_groups_floor_area.total_floor_area
                ) * 100
        END AS residential_floor_area_percentage,
        block_groups_income.total_population,
        block_groups_income.lowmod_population AS low_mod_income_population,
        block_groups_income.lowmod_pct AS low_mod_income_population_percentage
    FROM block_groups_floor_area
    LEFT JOIN block_groups_income
        ON block_groups_floor_area.geoid = block_groups_income.geoid
)

SELECT * FROM block_group_details
35 changes: 35 additions & 0 deletions products/cdbg/models/intermediate/int__lot_block_groups.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Lot / block group intersections with a corrected overlap ratio:
-- a lot with more than 90% of its area in one block group is assigned entirely
-- to that block group; every other lot keeps its raw intersections and ratios.
WITH lot_block_groups AS (
    SELECT
        bbl,
        block_group_geoid,
        overlap_ratio
    FROM {{ ref("int__lot_block_groups_raw") }}
),

-- intersections without an overlap ratio cannot be apportioned, so drop them
valid_lot_block_groups AS (
    SELECT
        bbl,
        block_group_geoid,
        overlap_ratio
    FROM lot_block_groups
    WHERE overlap_ratio IS NOT null
),

-- lots that sit (almost) entirely in one block group: keep only that pairing, at a ratio of 1
lots_easy AS (
    SELECT
        bbl,
        block_group_geoid,
        1 AS overlap_ratio
    FROM valid_lot_block_groups
    WHERE overlap_ratio > 0.9
),

-- all intersections of the remaining lots, with their raw ratios.
-- NOT EXISTS instead of NOT IN: a single NULL bbl in lots_easy would make
-- NOT IN evaluate to unknown for every row and silently drop all split lots
lots_split AS (
    SELECT
        valid_lot_block_groups.bbl,
        valid_lot_block_groups.block_group_geoid,
        valid_lot_block_groups.overlap_ratio
    FROM valid_lot_block_groups
    WHERE NOT EXISTS (
        SELECT 1
        FROM lots_easy
        WHERE lots_easy.bbl = valid_lot_block_groups.bbl
    )
),

-- the two sets are disjoint by bbl, so UNION ALL cannot introduce duplicates
lots AS (
    SELECT
        bbl,
        block_group_geoid,
        overlap_ratio
    FROM lots_easy
    UNION ALL
    SELECT
        bbl,
        block_group_geoid,
        overlap_ratio
    FROM lots_split
)

SELECT * FROM lots
Loading

0 comments on commit 2bac1f3

Please sign in to comment.