cuSpatial Polygonize Notebook #1233

Draft - wants to merge 1 commit into base: branch-23.08
192 changes: 192 additions & 0 deletions notebooks/cuSpatial-Polygonize.ipynb
@@ -0,0 +1,192 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import rasterio\n",
"import cupy as cp\n",
"import cuspatial\n",
"\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Intro\n",
"This notebook takes Sentinel-2 JPEG2000 images and performs a simple GPU polygonize using cuPy and cuSpatial. Once vectorized the raster image can be used like any other vector dataset, and gain GPU acceleration from other RAPIDS libraries like cuDF and cuML.\n",
"\n",
"## Sentinel 2 Data \n",
"Sentinel-2 Data is open and free, it can be accessed here: https://scihub.copernicus.eu/dhus/#/home\n",
"\n",
"The Sentinel-2 mission delivers medium/high-resolution optical images (from visible to shortwave infrared range) and frequent revisit times (every 5 days with two satellites in operation). Each tile covers 100km x 100km and is projected in UTM. The data is delivered as a set of tiles in JPEG2000 format."
]
},
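{
"cell_type": "markdown",
"metadata": {},
"source": [
"Later in the notebook the pixel polygons are handed to cuspatial.GeoSeries.from_polygons_xy as a flat array of interleaved x,y coordinates plus three offset arrays (ring, part, geometry). As a minimal, purely illustrative sketch of that layout, a single closed unit square looks like this:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch only: one unit square in the interleaved-xy + offsets layout\n",
"# used later for the full raster (5 points per ring, closing vertex repeated)\n",
"square_xy = cp.array([0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0])\n",
"square_ring_offsets = cp.array([0, 5])      # offsets count points: this ring has 5 points\n",
"square_part_offsets = cp.array([0, 1])      # one ring per polygon part\n",
"square_geometry_offsets = cp.array([0, 1])  # one part per geometry\n",
"cuspatial.GeoSeries.from_polygons_xy(square_xy, square_ring_offsets, square_part_offsets, square_geometry_offsets)"
]
},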
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: upload this data to S3 or similar so we don't need to use Copernicus (it requires an account)\n",
"# TODO: I wonder if I can use NVJPEG2000's python bindings from this repo https://github.com/louis-she/nvjpeg2k-python to read the jp2s - could be a totally GPU solution\n",
"sentinel_2_data = r'S2A_MSIL2A_20230711T153821_N0509_R011_T18TWL_20230711T233752/S2A_MSIL2A_20230711T153821_N0509_R011_T18TWL_20230711T233752.SAFE/GRANULE/L2A_T18TWL_A042050_20230711T154201/IMG_DATA/R10m/'\n",
"sentinel_2_path = Path(sentinel_2_data)\n",
"band_jp2s = [x for x in sentinel_2_path.iterdir() if '_B0' in str(x)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### CPU-version - needed a 64GB swapfile on 32GB system memory; gave up after 33m runtime on an i9-13900k\n",
"# import geopandas as gpd\n",
"# import rasterio\n",
"# from shapely.geometry import shape\n",
"\n",
"# # read the data and create the shapes \n",
"# with rasterio.open(band_jp2s[0]) as f:\n",
"# data = f.read(1)\n",
"# data = data.astype('int16')\n",
"# shapes = rasterio.features.shapes(data, transform=f.transform)\n",
"\n",
"# # read the shapes as separate lists\n",
"# pixel_values = []\n",
"# geometry = []\n",
"# for shapedict, value in shapes:\n",
"# pixel_values.append(value)\n",
"# geometry.append(shape(shapedict))\n",
"\n",
"# # build the gdf object over the two lists\n",
"# gdf = gpd.GeoDataFrame(\n",
"# {'pixel_values': pixel_values, 'geometry': geometry },\n",
"# crs=\"EPSG:32618\"\n",
"# )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def gpu_simple_polygonize(raster_file, bounding_box_mask=None):\n",
" with rasterio.open(raster_file) as src:\n",
" # Get the coordinates of the pixels\n",
" x_coords = cp.arange(src.bounds.left, src.bounds.right, src.transform[0])\n",
" y_coords = cp.arange(src.bounds.top + src.transform[4], src.bounds.bottom + src.transform[4], src.transform[4]) \n",
" \n",
" # TODO: Add cuProj Transformation to WGS here\n",
" # TODO: Support masking to help reduce memory errors, this image covers a lot outside of NYC\n",
" cols, rows = cp.meshgrid(x_coords, y_coords)\n",
" del x_coords, y_coords\n",
"\n",
" # Create arrays representing the four corners of each polygon\n",
" top_left = cp.dstack((cols, rows))\n",
" top_right = cp.dstack((cols + src.transform[0], rows))\n",
" bottom_right = cp.dstack((cols + src.transform[0], rows - src.transform[4]))\n",
" bottom_left = cp.dstack((cols, rows - src.transform[4]))\n",
"\n",
" polygons = cp.stack((top_left, top_right, bottom_right, bottom_left, top_left), axis=2)\n",
" del top_left, top_right, bottom_right, bottom_left, cols, rows\n",
"\n",
" interleaved_polygons = polygons.reshape(polygons.shape[0], polygons.shape[1], -1)\n",
" del polygons\n",
"\n",
" return interleaved_polygons.flatten()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generate the geometry from the raster\n",
"polygons_for_cuspatial = gpu_simple_polygonize(band_jp2s[0])\n",
"geometry_offsets = cp.arange(0, int(len(polygons_for_cuspatial)/10) + 1, 1)\n",
"part_offsets = geometry_offsets\n",
"ring_offsets = cp.arange(0, int(len(polygons_for_cuspatial)/2) + 1, 5)\n",
"\n",
"cuspatial_geoseries = cuspatial.GeoSeries.from_polygons_xy(polygons_for_cuspatial, ring_offsets, part_offsets, geometry_offsets)\n",
"polygonized_raster = cuspatial.GeoDataFrame()\n",
"polygonized_raster['geometry'] = cuspatial_geoseries\n",
"del cuspatial_geoseries, polygons_for_cuspatial, geometry_offsets, part_offsets, ring_offsets"
]
},
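{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (optional, not required by the rest of the notebook): every pixel should have produced exactly one polygon, so the GeoDataFrame should have width x height rows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: one polygon per pixel\n",
"with rasterio.open(band_jp2s[0]) as src:\n",
" assert len(polygonized_raster) == src.width * src.height"
]
},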
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add band values to the GeoDataFrame\n",
"for band in band_jp2s:\n",
" band_number = str(band).split('_')[-2]\n",
" with rasterio.open(band) as src:\n",
" # Read pixel values and add to our cudf DataFrame (they return as a numpy array)\n",
" polygonized_raster[band_number] = src.read().flatten()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generate NDVI - a vegetation index, values over 0.5 are very likely to be vegetation\n",
"polygonized_raster['NDVI'] = (polygonized_raster['B08'] - polygonized_raster['B04']) / (polygonized_raster['B08'] + polygonized_raster['B04'])\n",
"vegetation = polygonized_raster[(polygonized_raster['NDVI'] >= 0.5) & (polygonized_raster['NDVI'] < 1)]\n",
"vegetation.reset_index(inplace=True, drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# I wonder if we can do a DBSCAN that clusters based on band values that gives us a cluster similar to our vegetation pixels\n",
"# Not working right now, probably need to use cuml.preprocessing.StandardScaler to scale the data first\n",
"import cudf\n",
"from cuml.cluster import DBSCAN\n",
"\n",
"polygonized_raster = polygonized_raster[:100000] # Sliced for memory reasons\n",
"\n",
"dbscan = DBSCAN(eps=250, min_samples=2)\n",
"# Workaround for it breaking on the geometry column, hopefully a better way exists for memory reasons\n",
"clusters = dbscan.fit_predict(cudf.DataFrame([polygonized_raster.B02, polygonized_raster.B03, polygonized_raster.B04, polygonized_raster.B08]).astype('float32'))\n",
"clusters"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "rapids-23.06",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}