diff --git a/README.md b/README.md
index eb94209..c7e0844 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,34 @@
 # Space2Stats
 
-Consistent, comparable, authoritative data describing sub-national variation is a constant point of complication for World Bank teams, our development partners, and client countries when assessing and investigating economic issues and national policy. This project will focus on creating and disseminating such data through aggregation of geospatial information at standard administrative divisions, and through the attribution of household survey data with foundational geospatial variables.
\ No newline at end of file
+Consistent, comparable, authoritative data describing sub-national variation is a constant point of complication for World Bank teams, our development partners, and client countries when assessing and investigating economic issues and national policy. This project will focus on creating and disseminating such data through aggregation of geospatial information at standard administrative divisions, and through the attribution of household survey data with foundational geospatial variables.
+
+## Getting Started Locally
+
+- Set up the database:
+```
+docker-compose up -d
+```
+
+- Create a `db.env` file:
+```.env
+DB_HOST=localhost
+DB_PORT=5439
+DB_NAME=postgis
+DB_USER=username
+DB_PASSWORD=password
+DB_TABLE_NAME=space2stats
+```
+
+- Load our dataset into the database:
+```
+./postgres/download_parquet.sh
+python postgres/chunk_parquet.py
+./postgres/load_parquet_chunks.sh
+```
+
+> You can get started with a subset of data for NYC with `./load_nyc_sample.sh`, which requires changing your `db.env` value for `DB_TABLE_NAME` to `space2stats_nyc_sample`.
+
+- Access your data using the Space2Stats API! See the [example notebook](notebooks/space2stats_api_demo.ipynb).
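Once the chunks have loaded, it can be worth confirming that the target table is actually populated before moving on to the API. Below is a minimal sketch, assuming `psycopg2` is installed and that your connection values match the `db.env` example above; swap the table name for `space2stats_nyc_sample` if you loaded the NYC subset.

```python
# Quick row-count sanity check against the local PostGIS container.
# Assumes psycopg2 is installed and the connection values match db.env above.
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    port=5439,
    dbname="postgis",
    user="username",
    password="password",
)
with conn, conn.cursor() as cur:
    # Use space2stats_nyc_sample here if you loaded the NYC subset instead.
    cur.execute("SELECT COUNT(*) FROM space2stats;")
    print("Rows loaded:", cur.fetchone()[0])
conn.close()
```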
+ + + diff --git a/notebooks/space2stats_api_demo.ipynb b/notebooks/space2stats_api_demo.ipynb new file mode 100644 index 0000000..f9cdb5c --- /dev/null +++ b/notebooks/space2stats_api_demo.ipynb @@ -0,0 +1,527 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "\n", + "import requests\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "import h3\n", + "from shapely.geometry import Point\n", + "\n", + "from lonboard.colormap import apply_continuous_cmap\n", + "from lonboard import Map, ScatterplotLayer\n", + "from palettable.cartocolors.sequential import BurgYl_2\n", + "from geojson_pydantic import Feature, Polygon" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_URL = \"http://localhost:8000\"\n", + "FIELDS_ENDPOINT = f\"{BASE_URL}/fields\"\n", + "SUMMARY_ENDPOINT = f\"{BASE_URL}/summary\"" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available Fields: ['sum_pop_2020', 'ogc_fid', 'sum_pop_f_0_2020', 'sum_pop_f_10_2020', 'sum_pop_f_15_2020', 'sum_pop_f_1_2020', 'sum_pop_f_20_2020', 'sum_pop_f_25_2020', 'sum_pop_f_30_2020', 'sum_pop_f_35_2020', 'sum_pop_f_40_2020', 'sum_pop_f_45_2020', 'sum_pop_f_50_2020', 'sum_pop_f_55_2020', 'sum_pop_f_5_2020', 'sum_pop_f_60_2020', 'sum_pop_f_65_2020', 'sum_pop_f_70_2020', 'sum_pop_f_75_2020', 'sum_pop_f_80_2020', 'sum_pop_m_0_2020', 'sum_pop_m_10_2020', 'sum_pop_m_15_2020', 'sum_pop_m_1_2020', 'sum_pop_m_20_2020', 'sum_pop_m_25_2020', 'sum_pop_m_30_2020', 'sum_pop_m_35_2020', 'sum_pop_m_40_2020', 'sum_pop_m_45_2020', 'sum_pop_m_50_2020', 'sum_pop_m_55_2020', 'sum_pop_m_5_2020', 'sum_pop_m_60_2020', 'sum_pop_m_65_2020', 'sum_pop_m_70_2020', 'sum_pop_m_75_2020', 'sum_pop_m_80_2020', 'sum_pop_m_2020', 'sum_pop_f_2020']\n" + ] + } + ], + "source": [ + "response = requests.get(FIELDS_ENDPOINT)\n", + "if response.status_code != 200:\n", + " raise Exception(f\"Failed to get fields: {response.text}\")\n", + "\n", + "available_fields = response.json()\n", + "print(\"Available Fields:\", available_fields)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "AOIModel = Feature[Polygon, Dict]\n", + "\n", + "# kenya\n", + "aoi = {\n", + " \"type\": \"Feature\",\n", + " \"geometry\": {\n", + " \"type\": \"Polygon\",\n", + " \"coordinates\": [\n", + " [\n", + " [41.85508309264397, -1.68325],\n", + " [40.98105, -2.49979],\n", + " [40.993, -3.444],\n", + " [41.58513, -3.91909],\n", + " [40.88477, -4.95913],\n", + " [39.55938425876585, -4.437641590288629],\n", + " [39.25451, -3.42206],\n", + " [37.7669, -3.67712],\n", + " [37.69869, -3.09699],\n", + " [34.07262, -1.05982],\n", + " [33.90371119710453, -0.95],\n", + " [33.893568969666944, 0.109813537861896],\n", + " [34.18, 0.515],\n", + " [34.6721, 1.17694],\n", + " [35.03599, 1.90584],\n", + " [34.59607, 3.05374],\n", + " [34.47913, 3.5556],\n", + " [35.298007118232946, 4.77696566346189],\n", + " [35.817447662353516, 4.77696566346189],\n", + " [36.159078632855646, 4.447864127672769],\n", + " [36.85509323800812, 4.447864127672769],\n", + " [38.120915, 3.598605],\n", + " [38.43697, 3.58851],\n", + " [38.67114, 3.61607],\n", + " [38.89251, 3.50074],\n", + " [39.55938425876585, 3.42206],\n", + " [39.85494, 3.83879],\n", + " [40.76848, 4.25702],\n", + " [41.1718, 
3.91909],\n", + " [41.85508309264397, 2.97959],\n", + " [41.58513, 2.09],\n", + " [40.993, 1.657],\n", + " [40.98105, 1.002],\n", + " [41.85508309264397, -1.68325]\n", + " ]\n", + " ]\n", + " },\n", + " \"properties\": {\n", + " \"name\": \"Kenya\"\n", + " }\n", + " }\n", + "\n", + "\n", + "feat = AOIModel(**aoi)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[stripped HTML table output omitted; it duplicates the text/plain DataFrame preview that follows: 16217 rows × 4 columns (hex_id, sum_pop_2020, sum_pop_f_2020, sum_pop_m_2020)]
" + ], + "text/plain": [ + " hex_id sum_pop_2020 sum_pop_f_2020 sum_pop_m_2020\n", + "0 866a4a48fffffff 399.860905 189.675539 210.185366\n", + "1 866a4a497ffffff 582.555159 276.337255 306.217904\n", + "2 866a4a49fffffff 749.911237 355.723245 394.187992\n", + "3 866a4a4d7ffffff 863.888290 418.309236 445.579054\n", + "4 866a5820fffffff 525.085147 249.076134 276.009012\n", + "... ... ... ... ...\n", + "16212 867b5dd77ffffff -36.000000 -18.000000 -18.000000\n", + "16213 867b5dd87ffffff -36.000000 -18.000000 -18.000000\n", + "16214 867b5dd8fffffff -36.000000 -18.000000 -18.000000\n", + "16215 867b5dd9fffffff -36.000000 -18.000000 -18.000000\n", + "16216 867b5ddafffffff -36.000000 -18.000000 -18.000000\n", + "\n", + "[16217 rows x 4 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the Request Payload\n", + "request_payload = {\n", + " \"aoi\": aoi,\n", + " \"spatial_join_method\": \"centroid\",\n", + " \"fields\": [\"sum_pop_2020\", \"sum_pop_f_2020\", \"sum_pop_m_2020\"] # Use all available fields\n", + "}\n", + "\n", + "# Get Summary Data\n", + "response = requests.post(SUMMARY_ENDPOINT, json=request_payload)\n", + "if response.status_code != 200:\n", + " raise Exception(f\"Failed to get summary: {response.text}\")\n", + "\n", + "summary_data = response.json()\n", + "\n", + "# Convert Summary Data to DataFrame\n", + "summary_df = pd.DataFrame(summary_data)\n", + "summary_df" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def h3_to_point(h3_id):\n", + " lat, lng = h3.h3_to_geo(h3_id)\n", + " \n", + " return Point(lng, lat)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[stripped HTML table output omitted; it duplicates the text/plain GeoDataFrame preview that follows: 16217 rows × 5 columns (hex_id, sum_pop_2020, sum_pop_f_2020, sum_pop_m_2020, geometry)]
" + ], + "text/plain": [ + " hex_id sum_pop_2020 sum_pop_f_2020 sum_pop_m_2020 \\\n", + "0 866a4a48fffffff 399.860905 189.675539 210.185366 \n", + "1 866a4a497ffffff 582.555159 276.337255 306.217904 \n", + "2 866a4a49fffffff 749.911237 355.723245 394.187992 \n", + "3 866a4a4d7ffffff 863.888290 418.309236 445.579054 \n", + "4 866a5820fffffff 525.085147 249.076134 276.009012 \n", + "... ... ... ... ... \n", + "16212 867b5dd77ffffff -36.000000 -18.000000 -18.000000 \n", + "16213 867b5dd87ffffff -36.000000 -18.000000 -18.000000 \n", + "16214 867b5dd8fffffff -36.000000 -18.000000 -18.000000 \n", + "16215 867b5dd9fffffff -36.000000 -18.000000 -18.000000 \n", + "16216 867b5ddafffffff -36.000000 -18.000000 -18.000000 \n", + "\n", + " geometry \n", + "0 POINT (35.77461 4.75647) \n", + "1 POINT (35.67197 4.74377) \n", + "2 POINT (35.72824 4.72169) \n", + "3 POINT (35.83087 4.73438) \n", + "4 POINT (34.87996 4.14901) \n", + "... ... \n", + "16212 POINT (40.83955 -4.90064) \n", + "16213 POINT (41.02377 -4.74704) \n", + "16214 POINT (40.96649 -4.72508) \n", + "16215 POINT (41.01249 -4.68672) \n", + "16216 POINT (40.97776 -4.78541) \n", + "\n", + "[16217 rows x 5 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary_df['geometry'] = summary_df['hex_id'].apply(h3_to_point)\n", + "\n", + "gdf = gpd.GeoDataFrame(summary_df, geometry='geometry')\n", + "gdf\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/zacdez/Library/Caches/pypoetry/virtualenvs/notebooks-sNYx7QfP-py3.12/lib/python3.12/site-packages/lonboard/_geoarrow/ops/reproject.py:23: UserWarning: No CRS exists on data. 
If no data is shown on the map, double check that your CRS is WGS84.\n", + " warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "38d54fd92ba84320927ca29d9bca3ae1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map(layers=[ScatterplotLayer(get_fill_color=\n", + "[\n", + " [\n", + " 2…" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max = gdf[\"sum_pop_2020\"].max()\n", + "min = gdf[\"sum_pop_2020\"].min()\n", + "normalized_sum_pop_2020 = (gdf[\"sum_pop_2020\"] - min) / (max - min)\n", + "normalized_sum_pop_2020\n", + "\n", + "layer = ScatterplotLayer.from_geopandas(gdf, get_radius=2000, get_fill_color=apply_continuous_cmap(normalized_sum_pop_2020, BurgYl_2, alpha=0.7))\n", + "m = Map(layer)\n", + "m\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/postgres/chunk_parquet.py b/postgres/chunk_parquet.py index ad9f254..f6806e6 100644 --- a/postgres/chunk_parquet.py +++ b/postgres/chunk_parquet.py @@ -1,11 +1,16 @@ -import pandas as pd +import os +import pandas as pd -df = pd.read_parquet('space2stats.parquet') +chunk_dir = "parquet_chunks" +df = pd.read_parquet('space2stats_updated.parquet') chunk_size = 100000 # Number of rows per chunk +if not os.path.exists(chunk_dir): + os.mkdir(chunk_dir) + for i in range(0, len(df), chunk_size): chunk = df.iloc[i:i + chunk_size] - chunk.to_parquet(f'parquet_chunks/space2stats_part_{i // chunk_size}.parquet') + chunk.to_parquet(os.path.join(chunk_dir, f'space2stats_part_{i // chunk_size}.parquet')) print("Parquet file split into smaller chunks.") \ No newline at end of file diff --git a/postgres/load_parquet_chunks.sh b/postgres/load_parquet_chunks.sh index bc3be4e..18c1dd4 100755 --- a/postgres/load_parquet_chunks.sh +++ b/postgres/load_parquet_chunks.sh @@ -33,7 +33,8 @@ do ogr2ogr -f "PostgreSQL" \ PG:"host=$DB_HOST port=$DB_PORT dbname=$DB_NAME user=$DB_USER password=$DB_PASSWORD" \ "$PARQUET_FILE" \ - -nln $TABLE_NAME + -nln $TABLE_NAME \ + -lco SPATIAL_INDEX=NONE TABLE_EXISTS="t" fi
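The updated `chunk_parquet.py` above still reads the entire parquet file into a single pandas DataFrame before splitting it. For larger extracts, the same chunking step could be done with pyarrow record batches so the whole file never sits in memory at once; the following is only a sketch under that assumption, reusing the file and directory names from the script above.

```python
# Sketch of the same chunking step using pyarrow record batches instead of
# loading the full DataFrame; assumes pyarrow is installed. File and directory
# names mirror postgres/chunk_parquet.py.
import os

import pyarrow as pa
import pyarrow.parquet as pq

chunk_dir = "parquet_chunks"
os.makedirs(chunk_dir, exist_ok=True)

parquet_file = pq.ParquetFile("space2stats_updated.parquet")
for i, batch in enumerate(parquet_file.iter_batches(batch_size=100_000)):
    # Each batch holds at most 100,000 rows, matching chunk_size in the script.
    pq.write_table(
        pa.Table.from_batches([batch]),
        os.path.join(chunk_dir, f"space2stats_part_{i}.parquet"),
    )

print("Parquet file split into smaller chunks.")
```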