From 109c6fc839cf19d47e1fb1717cb9810e353c324b Mon Sep 17 00:00:00 2001 From: casadoj Date: Tue, 27 Aug 2024 15:08:22 +0200 Subject: [PATCH] Copy files from lisflood-preprocessing --- src/lisfloodutilities/lfcoords/__init__.py | 37 ++++ .../lfcoords/coarser_grid.py | 196 ++++++++++++++++++ src/lisfloodutilities/lfcoords/config.yml | 13 ++ src/lisfloodutilities/lfcoords/finer_grid.py | 130 ++++++++++++ src/lisfloodutilities/lfcoords/lfcoords.py | 44 ++++ src/lisfloodutilities/lfcoords/utils.py | 139 +++++++++++++ 6 files changed, 559 insertions(+) create mode 100644 src/lisfloodutilities/lfcoords/__init__.py create mode 100644 src/lisfloodutilities/lfcoords/coarser_grid.py create mode 100644 src/lisfloodutilities/lfcoords/config.yml create mode 100644 src/lisfloodutilities/lfcoords/finer_grid.py create mode 100644 src/lisfloodutilities/lfcoords/lfcoords.py create mode 100644 src/lisfloodutilities/lfcoords/utils.py diff --git a/src/lisfloodutilities/lfcoords/__init__.py b/src/lisfloodutilities/lfcoords/__init__.py new file mode 100644 index 0000000..7ca5e4e --- /dev/null +++ b/src/lisfloodutilities/lfcoords/__init__.py @@ -0,0 +1,37 @@ +import yaml +from pathlib import Path +from typing import Union, Dict + + +class Config: + def __init__(self, config_file): + """ + Reads the configuration from a YAML file and sets default values if not provided. + + Parameters: + ----------- + config_file: string or pathlib.Path + The path to the YAML configuration file. + """ + + # read configuration file + with open(config_file, 'r', encoding='utf8') as ymlfile: + config = yaml.load(ymlfile, Loader=yaml.FullLoader) + + # input + self.STATIONS = Path(config['input']['stations']) + self.LDD_FINE = Path(config['input']['ldd_fine']) + self.UPSTREAM_FINE = Path(config['input']['upstream_fine']) + self.LDD_COARSE = Path(config['input']['ldd_coarse']) + self.UPSTREAM_COARSE = Path(config['input']['upstream_coarse']) + + # output + self.SHAPE_FOLDER = Path(config.get('output_folder', './shapefiles')) + self.SHAPE_FOLDER.mkdir(parents=True, exist_ok=True) + + # conditions + self.MIN_AREA = config['conditions'].get('min_area', 10) + self.ABS_ERROR = config['conditions'].get('abs_error', 50) + self.PCT_ERROR = config['conditions'].get('pct_error', 1) + + \ No newline at end of file diff --git a/src/lisfloodutilities/lfcoords/coarser_grid.py b/src/lisfloodutilities/lfcoords/coarser_grid.py new file mode 100644 index 0000000..7b4e6d3 --- /dev/null +++ b/src/lisfloodutilities/lfcoords/coarser_grid.py @@ -0,0 +1,196 @@ +import os +os.environ['USE_PYGEOS'] = '0' +import numpy as np +import pandas as pd +import geopandas as gpd +import rioxarray +import pyflwdir +from pathlib import Path +from tqdm.auto import tqdm +from typing import Optional +import logging +import warnings +warnings.filterwarnings("ignore") + +from lisfloodpreprocessing import Config +from lisfloodpreprocessing.utils import catchment_polygon + +# set logger +logging.basicConfig(level=logging.INFO, + format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + +def coordinates_coarse( + cfg: Config, + stations: pd.DataFrame, + save: bool = True +) -> Optional[pd.DataFrame]: + """ + Transforms station coordinates from a high-resolution grid to a corresponding location in a coarser grid, aiming to match the shape of the catchment area derived from the high-resolution map. It updates the station coordinates and exports the catchment areas as shapefiles in the coarser grid. + + The function reads the upstream area map and local drainage direction (LDD) map in the coarse grid. It then finds the pixel in the coarse grid that best matches the catchment shape derived from the high-resolution map. The match is evaluated based on the intersection-over-union of catchment shapes and the ratio of upstream areas. + + Parameters: + ----------- + cfg: Config + Configuration object containing file paths and parameters specified in the configuration file. + stations: pandas.DataFrame + DataFrame containing station coordinates and upstream areas in the finer grid. It's the result of coarse_grid.coarse_grid() + save: bool + If True, the updated stations table is saved to a CSV file. + If False, the updated stations DataFrame is returned without saving. + + Returns: + -------- + staions: pandas.DataFrame + If save is False, returns a pandas DataFrame with updated station coordinates and upstream areas in the coarser grid. Otherwise, the function returns None, and the results are saved directly to a CSV file. + """ + + ### READ INPUTS + + # read upstream area map of coarse grid + upstream_coarse = rioxarray.open_rasterio(cfg.UPSTREAM_COARSE).squeeze(dim='band') + logger.info(f'Map of upstream area corretly read: {cfg.UPSTREAM_COARSE}') + + # read local drainage direction map + ldd_coarse = rioxarray.open_rasterio(cfg.LDD_COARSE).squeeze(dim='band') + logger.info(f'Map of local drainage directions correctly read: {cfg.LDD_COARSE}') + + + ### PROCESSING + + # create river network + fdir_coarse = pyflwdir.from_array(ldd_coarse.data, + ftype='ldd', + transform=ldd_coarse.rio.transform(), + check_ftype=False, + latlon=True) + + # boundaries of the input maps + lon_min, lat_min, lon_max, lat_max = np.round(ldd_coarse.rio.bounds(), 6) + + # resolution of the input maps + cellsize = np.round(np.mean(np.diff(ldd_coarse.x)), 6) # degrees + cellsize_arcmin = int(np.round(cellsize * 60, 0)) # arcmin + suffix_coarse = f'{cellsize_arcmin}min' + logger.info(f'Coarse resolution is {cellsize_arcmin} arcminutes') + + # extract resolution of the finer grid from 'stations' + suffix_fine = sorted(set([col.split('_')[1] for col in stations.columns if '_' in col]))[0] + cols_fine = [f'{col}_{suffix_fine}' for col in ['area', 'lat', 'lon']] + + # add new columns to 'stations' + cols_coarse = [f'{col}_{suffix_coarse}' for col in ['area', 'lat', 'lon']] + stations[cols_coarse] = np.nan + + # output folders + SHAPE_FOLDER_FINE = cfg.SHAPE_FOLDER / suffix_fine + SHAPE_FOLDER_COARSE = cfg.SHAPE_FOLDER / suffix_coarse + SHAPE_FOLDER_COARSE.mkdir(parents=True, exist_ok=True) + + # search range of 5x5 array -> this is where the best point can be found in the coarse grid + rangexy = np.linspace(-2, 2, 5) * cellsize # arcmin + for ID, attrs in tqdm(stations.iterrows(), total=stations.shape[0], desc='stations'): + + # real upstream area + area_ref = attrs['area'] + + # coordinates and upstream area in the fine grid + lat_fine, lon_fine, area_fine = attrs[[f'{col}_{suffix_fine}' for col in ['lat', 'lon', 'area']]] + + if (area_ref < cfg.MIN_AREA) or (area_fine < cfg.MIN_AREA): + logger.warning(f'The catchment area of station {ID} is smaller than the minimum of {cfg.MIN_AREA} km2') + continue + + # import shapefile of catchment polygon + shapefile = SHAPE_FOLDER_FINE / f'{ID}.shp' + try: + basin_fine = gpd.read_file(shapefile) + logger.info(f'Catchment polygon correctly read: {shapefile}') + except OSError as e: + logger.error(f'Error reading {shapefile}: {e}') + continue + except Exception as e: # This will catch other exceptions that might occur. + logger.error(f'An unexpected error occurred while reading {shapefile}: {e}') + continue + + # find ratio + logger.debug('Start search') + inter_vs_union, area_ratio, area_lisf = [], [], [] + for Δlat in rangexy: + for Δlon in rangexy: + lon = lon_fine + Δlon + lat = lat_fine + Δlat + basin = catchment_polygon(fdir_coarse.basins(xy=(lon, lat)).astype(np.int32), + transform=ldd_coarse.rio.transform(), + crs=ldd_coarse.rio.crs) + + # calculate union and intersection of shapes + intersection = gpd.overlay(basin_fine, basin, how='intersection') + union = gpd.overlay(basin_fine, basin, how='union') + inter_vs_union.append(intersection.area.sum() / union.area.sum()) + + # get upstream area (km2) of coarse grid (LISFLOOD) + area = upstream_coarse.sel(x=lon, y=lat, method='nearest').item() * 1e-6 + area_lisf.append(area) + + # ratio between reference and coarse area + if area_ref == 0 or area == 0: + ratio = 0 + else: + ratio = area_ref / area if area_ref < area else area / area_ref + area_ratio.append(ratio) + logger.debug('End search') + + # maximum of shape similarity and upstream area accordance + i_shape = np.argmax(inter_vs_union) + area_shape = area_lisf[i_shape] + i_centre = int(len(rangexy)**2 / 2) # middle point + area_centre = area_lisf[i_centre] + # use middle point if errors are small + abs_error = abs(area_shape - area_centre) + pct_error = 100 * abs(1 - area_centre / area_shape) + if (abs_error <= cfg.ABS_ERROR) and (pct_error <= cfg.PCT_ERROR): + i_shape = i_centre + area_shape = area_centre + + #i_ratio = np.argmax(area_ratio) + + # coordinates in the fine resolution + i = i_shape // len(rangexy) + j = i_shape % len(rangexy) + lat = lat_fine + rangexy[i] + lon = lon_fine + rangexy[j] + + # coordinates and upstream area on coarse resolution + area = upstream_coarse.sel(x=lon, y=lat, method='nearest') + area_coarse = area.item() * 1e-6 + lon_coarse = area.x.item() + lat_coarse = area.y.item() + + # derive catchment polygon from the selected coordinates + basin_coarse = catchment_polygon(fdir_coarse.basins(xy=(lon_coarse, lat_coarse)).astype(np.int32), + transform=ldd_coarse.rio.transform(), + crs=ldd_coarse.rio.crs, + name='ID') + basin_coarse['ID'] = ID + basin_coarse.set_index('ID', inplace=True) + basin_coarse[cols_fine] = area_fine, lat_fine, lon_fine + basin_coarse[cols_coarse] = area_coarse, lat_coarse, lon_coarse + + # export shapefile + output_shp = SHAPE_FOLDER_COARSE / f'{ID}.shp' + basin_coarse.to_file(output_shp) + logger.info(f'Catchment {ID} exported as shapefile: {output_shp}') + + # update new columns in 'stations' + stations.loc[ID, cols_coarse] = [int(area_coarse), round(lat_coarse, 6), round(lon_coarse, 6)] + + # return/save + stations.sort_index(axis=1, inplace=True) + if save: + output_csv = cfg.STATIONS.parent / f'{cfg.STATIONS.stem}_{suffix_coarse}.csv' + stations.to_csv(output_csv) + logger.info(f'The updated stations table in the coarser grid has been exported to: {output_csv}') + else: + return stations diff --git a/src/lisfloodutilities/lfcoords/config.yml b/src/lisfloodutilities/lfcoords/config.yml new file mode 100644 index 0000000..d66f3f7 --- /dev/null +++ b/src/lisfloodutilities/lfcoords/config.yml @@ -0,0 +1,13 @@ +input: + stations: # CSV file defining four station attributes: 'ID', 'lat', 'lon', 'area' in km2 + ldd_fine: # TIFF or NetCDF file of the local direction drainage in the high resolution grid + upstream_fine: # TIFF or NetCDF file of the upstream area (km2) in the high resolution grid + ldd_coarse: # TIFF or NetCDF file of the local direction drainage in the low resolution grid + upstream_coarse: # TIFF or NetCDF file of the upstream area (km2) in the low resolution grid + +output_folder: # folder where catchment shapefiles will be saved. By default, './shapefiles/' + +conditions: + min_area: # minimum catchment area (km2) to consider a station. By default, 10 km2 + abs_error: # maximum absolute error (km2) allowed between the fine and coarse resolution catchments. By default, 50 km2 + pct_error: # maximum percentage error (%) allowed between the fine and coarse resolution catchments. By default, 1% \ No newline at end of file diff --git a/src/lisfloodutilities/lfcoords/finer_grid.py b/src/lisfloodutilities/lfcoords/finer_grid.py new file mode 100644 index 0000000..eade254 --- /dev/null +++ b/src/lisfloodutilities/lfcoords/finer_grid.py @@ -0,0 +1,130 @@ +import numpy as np +import pandas as pd +import rioxarray +import pyflwdir +from tqdm.auto import tqdm +from pathlib import Path +from typing import Optional +import logging +import warnings +warnings.filterwarnings("ignore") + +from lisfloodpreprocessing import Config +from lisfloodpreprocessing.utils import find_pixel, catchment_polygon + +# set logger +logging.basicConfig(level=logging.INFO, + format='%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + +def coordinates_fine( + cfg: Config, + save: bool = True +) -> Optional[pd.DataFrame]: + """ + Processes station coordinates to find the most accurate pixel in a high-resolution map, + based on a reference value of catchment area. It updates the station coordinates and + exports the catchment areas as shapefiles. + + The function reads the upstream area map and local drainage direction (LDD) map in + fine resolution, as well as the station coordinates with their reference upstream areas. + It then iteratively searches for the best-matching pixel within an increasing search + range and applies penalties and factors to determine the closest match. The function + creates a boolean map of each station's catchment area and vectorizes it into a + GeoDataFrame for export as a shapefile. + + Parameters: + ----------- + cfg: Config + Configuration object containing file paths and parameters specified in the configuration file. + save: bool + If True, the updated stations table is saved to a CSV file. If False, the updated stations DataFrame is returned without saving. + + Returns: + -------- + stations: pandas.DataFrame + If save is False, returns a pandas DataFrame with updated station coordinates and upstream areas in the finer grid. Otherwise, the function returns None, and the results are saved directly to a CSV file. + """ + + # READ INPUTS + + # read upstream map with fine resolution + upstream_fine = rioxarray.open_rasterio(cfg.UPSTREAM_FINE).squeeze(dim='band') + logger.info(f'Map of upstream area corretly read: {cfg.UPSTREAM_FINE}') + + # read local drainage direction map + ldd_fine = rioxarray.open_rasterio(cfg.LDD_FINE).squeeze(dim='band') + logger.info(f'Map of local drainage directions corretly read: {cfg.LDD_FINE}') + + # read stations text file + stations = pd.read_csv(cfg.STATIONS, index_col='ID') + logger.info(f'Table of stations correctly read: {cfg.STATIONS}') + + + # PROCESSING + + # resolution of the input map + cellsize = np.mean(np.diff(upstream_fine.x)) # degrees + cellsize_arcsec = int(np.round(cellsize * 3600, 0)) # arcsec + suffix_fine = f'{cellsize_arcsec}sec' + logger.info(f'The resolution of the finer grid is {cellsize_arcsec} arcseconds') + + # add columns to the table of stations + new_cols = sorted([f'{col}_{suffix_fine}' for col in stations.columns]) + stations[new_cols] = np.nan + + # create river network + fdir_fine = pyflwdir.from_array(ldd_fine.data, + ftype='d8', + transform=ldd_fine.rio.transform(), + check_ftype=False, + latlon=True) + + # output path + SHAPE_FOLDER_FINE = cfg.SHAPE_FOLDER / suffix_fine + SHAPE_FOLDER_FINE.mkdir(parents=True, exist_ok=True) + + for ID, attrs in tqdm(stations.iterrows(), total=stations.shape[0], desc='stations'): + + # reference coordinates and upstream area + lat_ref, lon_ref, area_ref = attrs[['lat', 'lon', 'area']] + + # search new coordinates in an increasing range + ranges = [55, 101, 151] + penalties = [500, 500, 1000] + factors = [2, .5, .25] + acceptable_errors = [50, 80, np.nan] + for rangexy, penalty, factor, max_error in zip(ranges, penalties, factors, acceptable_errors): + logger.debug(f'Set range to {rangexy}') + lat, lon, error = find_pixel(upstream_fine, lat_ref, lon_ref, area_ref, rangexy=rangexy, penalty=penalty, factor=factor) + if error <= max_error: + break + + # update new columns in 'stations' + stations.loc[ID, new_cols] = [int(upstream_fine.sel(y=lat, x=lon).item()), round(lat, 6), round(lon, 6)] + + # boolean map of the catchment associated to the corrected coordinates + basin_arr = fdir_fine.basins(xy=(lon, lat)).astype(np.int32) + + # vectorize the boolean map into geopandas + basin_gdf = catchment_polygon(basin_arr.astype(np.int32), + transform=ldd_fine.rio.transform(), + crs=ldd_fine.rio.crs, + name='ID') + basin_gdf['ID'] = ID + basin_gdf.set_index('ID', inplace=True) + basin_gdf[attrs.index] = attrs.values + + # export shape file + output_file = SHAPE_FOLDER_FINE / f'{ID}.shp' + basin_gdf.to_file(output_file) + logger.info(f'Catchment {ID} exported as shapefile: {output_file}') + + # return/save + stations.sort_index(axis=1, inplace=True) + if save: + output_csv = cfg.STATIONS.parent / f'{cfg.STATIONS.stem}_{suffix_fine}.csv' + stations.to_csv(output_csv) + logger.info(f'The updated stations table in the finer grid has been exported to: {output_csv}') + else: + return stations \ No newline at end of file diff --git a/src/lisfloodutilities/lfcoords/lfcoords.py b/src/lisfloodutilities/lfcoords/lfcoords.py new file mode 100644 index 0000000..c3a09e3 --- /dev/null +++ b/src/lisfloodutilities/lfcoords/lfcoords.py @@ -0,0 +1,44 @@ +import pandas as pd +import argparse +import logging + +from lisfloodpreprocessing import Config +from lisfloodpreprocessing.finer_grid import coordinates_fine +from lisfloodpreprocessing.coarser_grid import coordinates_coarse + +def main(): + + parser = argparse.ArgumentParser( + description=""" + Correct the coordinates of a set of stations to match the river network in the + LISFLOOD static map. + First, it uses a reference value of catchment area to find the most accurate + pixel in a high-resolution map. + Second, it finds the pixel in the low-resolution map that better matches the + catchment shape derived from the high-resolution map. + """ + ) + parser.add_argument('-c', '--config-file', type=str, required=True, help='Path to the configuration file') + args = parser.parse_args() + + # create logger + logger = logging.getLogger('correct-coordinates') + logger.setLevel(logging.INFO) + logger.propagate = False + log_format = logging.Formatter('%(asctime)s | %(levelname)s | %(name)s | %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + c_handler = logging.StreamHandler() + c_handler.setFormatter(log_format) + c_handler.setLevel(logging.INFO) + logger.addHandler(c_handler) + + # read configuration + cfg = Config(args.config_file) + + # find coordinates in high resolution + stations_HR = coordinates_fine(cfg, save=False) + + # find coordinates in LISFLOOD + coordinates_coarse(cfg, stations_HR, save=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/lisfloodutilities/lfcoords/utils.py b/src/lisfloodutilities/lfcoords/utils.py new file mode 100644 index 0000000..8174c42 --- /dev/null +++ b/src/lisfloodutilities/lfcoords/utils.py @@ -0,0 +1,139 @@ +import os +os.environ['USE_PYGEOS'] = '0' +import numpy as np +import geopandas as gpd +import xarray as xr +from affine import Affine +from rasterio import features +from pyproj.crs import CRS +from typing import Tuple + + + +def find_pixel( + upstream: xr.DataArray, + lat: int, + lon: int, + area: float, + rangexy: int = 55, + penalty: int = 500, + factor: int = 2, + distance_scaler: float = .92, + error_threshold: int = 50 +) -> Tuple: + """ + Find the coordinates of the pixel in the upstream map with a smaller error compared with a reference area. + + Parameters: + ----------- + upstream: xr.DataArray + The upstream data containing latitude and longitude coordinates. + lat: float + The original latitude value. + lon: float + The original longitude value. + area: float + The reference area to calculate percent error. + rangexy: int, optional + The range in both x and y directions to search for the new location. + penalty: int, optional + The penalty value to add to the distance when the percent error is too high. + error_threshold: float, optional + The threshold for the percent error to apply the penalty. + factor: int, optional + The factor to multiply with the distance for the error calculation. + distance_scaler: float, optional + The scaling factor for the distance calculation in pixels. + + Returns: + -------- + lat_new : float + The latitude of the new location. + lon_new : float + The longitude of the new location. + min_error : float + The minimum error value at the new location. + """ + + # find coordinates of the nearest pixel in the map + nearest_pixel = upstream.sel(y=lat, x=lon, method='nearest') + lat_orig, lon_orig = [nearest_pixel[coord].item() for coord in ['y', 'x']] + + # extract subset of the upstream map + cellsize = np.mean(np.diff(upstream.x.data)) + delta = rangexy * cellsize + 1e-6 + upstream_sel = upstream.sel(y=slice(lat_orig + delta, lat_orig - delta), + x=slice(lon_orig - delta, lon_orig + delta)) + + # distance from the original pixel (in pixels) + i = np.arange(-rangexy, rangexy + 1) + ii, jj = np.meshgrid(i, i) + distance = xr.DataArray(data=np.sqrt(ii**2 + jj**2) * distance_scaler, coords=upstream_sel.coords, dims=upstream_sel.dims) + + # percent error in catchment area + error = 100 * abs(area - upstream_sel) / area + + # penalise if error is too big + distance = distance.where(error <= error_threshold, distance + penalty) + + # update error based on distance (every pixel deviation is a 'factor' increase in the error) + error += factor * distance + + # the new location is that with the smallest error + min_error = error.where(error == error.min(), drop=True) + lat_new, lon_new = [min_error[coord].item() for coord in ['y', 'x']] + + return lat_new, lon_new, min_error.item() + + + +def catchment_polygon( + data: np.ndarray, + transform: Affine, + crs: CRS, + name: str = "catchment" +) -> gpd.GeoDataFrame: + """ + Convert a boolean 2D array of catchment extent to a GeoDataFrame. + + Parameters: + ----------- + data : numpy.ndarray + The input raster data to be vectorized. + transform : affine.Affine + The affine transform matrix for the raster data. + crs : dict or str + The Coordinate Reference System (CRS) of the input data. + name : str, optional + The name to be used for the value column in the resulting GeoDataFrame. + + Returns: + -------- + gdf : geopandas.GeoDataFrame + A GeoDataFrame containing the vectorized geometries. + """ + + # Generate shapes and associated values from the raster data + feats_gen = features.shapes( + data, + mask=data != 0, + transform=transform, + connectivity=8, + ) + + # Create a list of features with geometry and properties + feats = [ + {"geometry": geom, "properties": {name: val}} for geom, val in feats_gen + ] + + # Check if no features were found + if not feats: + raise ValueError("No features found in the raster data.") + + # Create a GeoDataFrame from the features + gdf = gpd.GeoDataFrame.from_features(feats, crs=crs) + + # Convert the value column to the same data type as the input raster data + gdf[name] = gdf[name].astype(data.dtype) + + return gdf \ No newline at end of file