diff --git a/src/lisfloodutilities/gridding/generate_grids.py b/src/lisfloodutilities/gridding/generate_grids.py
index 2532739..1675a95 100755
--- a/src/lisfloodutilities/gridding/generate_grids.py
+++ b/src/lisfloodutilities/gridding/generate_grids.py
@@ -21,8 +21,8 @@
 from pathlib import Path
 from argparse import ArgumentParser, ArgumentTypeError
 from datetime import datetime, timedelta
-from lisfloodutilities.gridding.lib.utils import Printable, Dem, Config, FileUtils, GriddingUtils
-from lisfloodutilities.gridding.lib.writers import NetCDFWriter, GDALWriter
+from src.lisfloodutilities.gridding.lib.utils import Printable, Dem, Config, FileUtils, GriddingUtils
+from src.lisfloodutilities.gridding.lib.writers import NetCDFWriter, GDALWriter

 END_DATE_DEFAULT = datetime.now().strftime('%Y%m%d060000')

@@ -35,8 +35,14 @@ def print_msg(msg: str = ''):
     if not quiet_mode:
         print(msg)

-def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_dates_file: str, file_utils: FileUtils,
-        output_tiff: bool, overwrite_output: bool, start_date: datetime = None, end_date: datetime = None):
+def interpolation_mode_type(mode: str) -> str:
+    if not mode or mode not in Config.INTERPOLATION_MODES:
+        raise ArgumentTypeError(f'You must select a mode out of {list(Config.INTERPOLATION_MODES.keys())}.')
+    return mode
+
+def run(config_filename: str, infolder: str, output_file: str, processing_dates_file: str, file_utils: FileUtils,
+        output_tiff: bool, overwrite_output: bool, start_date: datetime = None, end_date: datetime = None,
+        interpolation_mode: str = 'adw', use_broadcasting: bool = False):
     """
     Interpolate text files containing (x, y, value) using inverse distance interpolation.
     Produces as output, either a netCDF file containing all the grids or one TIFF file per grid.
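The new `interpolation_mode_type` above is the usual argparse pattern: a `type=` callable that returns the value unchanged or raises `ArgumentTypeError`, which argparse converts into a clean usage error. A minimal, self-contained sketch of the same pattern (the mode names are copied from the `INTERPOLATION_MODES` keys introduced later in this patch):

```python
from argparse import ArgumentParser, ArgumentTypeError

MODES = {'nearest': 1, 'invdist': 20, 'adw': 11, 'cdd': 11}  # subset of Config.INTERPOLATION_MODES

def mode_type(mode: str) -> str:
    # Reject anything that is not a known interpolation mode.
    if not mode or mode not in MODES:
        raise ArgumentTypeError(f'You must select a mode out of {list(MODES.keys())}.')
    return mode

parser = ArgumentParser()
parser.add_argument('-m', '--mode', type=mode_type, default='adw')
print(parser.parse_args(['-m', 'adw']).mode)   # -> adw
# parser.parse_args(['-m', 'bogus'])           # exits with a usage error naming the valid modes
```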
@@ -47,13 +53,13 @@ def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_
     """
     global quiet_mode

-    conf = Config(config_filename, start_date, end_date, quiet_mode)
+    conf = Config(config_filename, start_date, end_date, quiet_mode, interpolation_mode)

     if conf.start_date > conf.end_date:
         raise ArgumentTypeError("Start date is greater than End date.")

     dates_to_process = file_utils.read_processing_dates_file(processing_dates_file)
-    grid_utils = GriddingUtils(conf, quiet_mode)
+    grid_utils = GriddingUtils(conf, quiet_mode, use_broadcasting)

     size_lons = len(conf.dem_lons)
     size_lats = len(conf.dem_lats)
@@ -69,34 +75,25 @@ def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_
     netcdf_offset_file_date = int(conf.get_config_field('VAR_TIME','OFFSET_FILE_DATE'))

+    outfile = output_file
     if output_tiff:
-        outfolder = outfolder_or_file
-        output_writer = GDALWriter(conf, overwrite_output, quiet_mode)
-        for filename in sorted(Path(infolder).rglob(inwildcard)):
-            file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
-            if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
-                continue # Skip processing file
-            print_msg(f'Processing file: {filename}')
-            outfile = str(filename).replace(infolder, outfolder)
-            outfilepath = Path(outfile).with_suffix('.tiff')
-            # Create the output parent folders if not exist yet
-            Path(outfilepath.parent).mkdir(parents=True, exist_ok=True)
-            output_writer.open(Path(outfilepath))
-            grid_data = grid_utils.generate_grid(filename)
-            output_writer.write(grid_data, file_timestamp)
-        output_writer.close()
-    else: # NetCDF
-        outfile = outfolder_or_file
-        output_writer = NetCDFWriter(conf, overwrite_output, quiet_mode)
-        output_writer.open(Path(outfile))
-        for filename in sorted(Path(infolder).rglob(inwildcard)):
-            file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
-            if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
-                continue # Skip processing file
-            print_msg(f'Processing file: {filename}')
-            grid_data = grid_utils.generate_grid(filename)
-            output_writer.write(grid_data, file_timestamp)
-        output_writer.close()
+        output_writer_tiff = GDALWriter(conf, overwrite_output, quiet_mode)
+    output_writer_netcdf = NetCDFWriter(conf, overwrite_output, quiet_mode)
+    output_writer_netcdf.open(Path(outfile))
+    for filename in sorted(Path(infolder).rglob(inwildcard)):
+        file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
+        if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
+            continue # Skip processing file
+        print_msg(f'Processing file: {filename}')
+        if output_tiff:
+            outfilepath = filename.with_suffix('.tiff')
+            output_writer_tiff.open(outfilepath)
+        grid_data = grid_utils.generate_grid(filename)
+        output_writer_netcdf.write(grid_data, file_timestamp)
+        if output_tiff:
+            output_writer_tiff.write(grid_data, file_timestamp)
+            output_writer_tiff.close()
+    output_writer_netcdf.close()

     print_msg('Finished writing files')

@@ -111,7 +108,7 @@ def main(argv):
     program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)

     program_longdesc = '''
-    This script interpolates meteo input variables data into either a single NETCDF4 file or one GEOTIFF file per timestep.
+    This script interpolates meteo input variables data into a single NETCDF4 file and, if selected, also generates one GEOTIFF file per timestep.
     The resulting netCDF is CF-1.6 compliant.
     '''
     program_license = """
@@ -134,14 +131,16 @@ def main(argv):
                             out_tiff=False,
                             overwrite_output=False,
                             start_date='',
-                            end_date=END_DATE_DEFAULT)
+                            end_date=END_DATE_DEFAULT,
+                            interpolation_mode='adw',
+                            use_broadcasting=False)

         parser.add_argument("-i", "--in", dest="infolder", required=True, type=FileUtils.folder_type,
                             help="Set input folder path with kiwis/point files",
                             metavar="input_folder")
-        parser.add_argument("-o", "--out", dest="outfolder_or_file", required=True, type=FileUtils.file_or_folder,
-                            help="Set output folder base path for the tiff files or the netCDF file path.",
-                            metavar="{output_folder, netcdf_file}")
+        parser.add_argument("-o", "--out", dest="output_file", required=True, type=FileUtils.file_or_folder,
+                            help="Set the output netCDF file path containing all the timesteps between start and end dates.",
+                            metavar="output_netcdf_file")
         parser.add_argument("-c", "--conf", dest="config_type", required=True,
                             help="Set the grid configuration type to use.",
                             metavar="{5x5km, 1arcmin,...}")
@@ -159,9 +158,14 @@ def main(argv):
                             metavar="YYYYMMDDHHMISS")
         parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", help="Set script output into quiet mode [default: %(default)s]")
         parser.add_argument("-t", "--tiff", dest="out_tiff", action="store_true",
-                            help="Outputs a tiff file per timestep instead of the default single netCDF [default: %(default)s]")
+                            help="Outputs a tiff file per timestep and also the single netCDF with all the timesteps [default: %(default)s]")
         parser.add_argument("-f", "--force", dest="overwrite_output", action="store_true",
                             help="Force write to existing file. TIFF files will be overwritten and netCDF file will be appended. [default: %(default)s]")
+        parser.add_argument("-m", "--mode", dest="interpolation_mode", required=False, type=interpolation_mode_type,
+                            help="Set interpolation mode. [default: %(default)s]",
+                            metavar=f"{list(Config.INTERPOLATION_MODES.keys())}")
+        parser.add_argument("-b", "--broadcast", dest="use_broadcasting", action="store_true",
+                            help="When set, computations will run faster in full broadcasting mode but require more memory. [default: %(default)s]")

         # process options
         args = parser.parse_args(argv)
@@ -194,17 +198,18 @@ def main(argv):
         print_msg(f"Input Folder: {args.infolder}")
         print_msg(f"Overwrite Output: {args.overwrite_output}")
+        print_msg(f"Interpolation Mode: {args.interpolation_mode}")
+        print_msg(f"Broadcasting: {args.use_broadcasting}")
         if args.out_tiff:
             print_msg("Output Type: TIFF")
-            print_msg(f"Output Folder: {args.outfolder_or_file}")
-        else:
-            print_msg("Output Type: netCDF")
-            print_msg(f"Output File: {args.outfolder_or_file}")
+            print_msg(f"Output Folder: {args.infolder}")
+        print_msg("Output Type: netCDF")
+        print_msg(f"Output File: {args.output_file}")
         print_msg(f"Processing Dates File: {args.processing_dates_file}")
         print_msg(f"Config File: {config_filename}")
-        run(config_filename, args.infolder, args.outfolder_or_file, args.processing_dates_file,
-            file_utils, args.out_tiff, args.overwrite_output, start_date, end_date)
+        run(config_filename, args.infolder, args.output_file, args.processing_dates_file,
+            file_utils, args.out_tiff, args.overwrite_output, start_date, end_date, args.interpolation_mode, args.use_broadcasting)
     except Exception as e:
         indent = len(program_name) * " "
         sys.stderr.write(program_name + ": " + repr(e) + "\n")
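With the new flags wired in, the script is driven as before but with an explicit interpolation mode. A hedged sketch of a programmatic invocation using only the flags defined above; the folder, file and config names are hypothetical placeholders:

```python
from src.lisfloodutilities.gridding.generate_grids import main

# Hypothetical paths and config type; the flags match the parser defined above.
main(['-i', '/data/kiwis',        # input folder with kiwis/point files
      '-o', '/data/out/pr6.nc',   # single output netCDF holding all timesteps
      '-c', '1arcmin',            # grid configuration type
      '-m', 'adw',                # interpolation mode, validated by interpolation_mode_type
      '-b',                       # full broadcasting: faster, but needs more memory
      '-t'])                      # additionally write one GeoTIFF per timestep
```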
diff --git a/src/lisfloodutilities/gridding/lib/utils.py b/src/lisfloodutilities/gridding/lib/utils.py
index 02f7258..ae6dc9b 100755
--- a/src/lisfloodutilities/gridding/lib/utils.py
+++ b/src/lisfloodutilities/gridding/lib/utils.py
@@ -47,8 +47,8 @@ def print_msg(self, msg: str = ''):

 class Dem(Printable):
-    def __init__(self, dem_map: Path):
-        super().__init__()
+    def __init__(self, dem_map: Path, quiet_mode: bool = False):
+        super().__init__(quiet_mode)
         self._dem_map = dem_map
         self.print_msg(f'Reading grid settings and altitude values from: {dem_map}')
         reader = NetCDFReader(self._dem_map)
@@ -150,14 +150,18 @@ def file_or_folder(fname: str) -> str:

 class Config(Printable):
+    # Each interpolation mode is paired with the number of neighbours used by the interpolation algorithm.
+    INTERPOLATION_MODES = {'nearest': 1, 'invdist': 20, 'adw': 11, 'cdd': 11,
+                           'bilinear': 4, 'triangulation': 3, 'bilinear_delaunay': 4}
     def __init__(self, config_filename: str, start_date: datetime = None,
-                 end_date: datetime = None, quiet_mode: bool = False):
+                 end_date: datetime = None, quiet_mode: bool = False, interpolation_mode: str = 'adw'):
         super().__init__(quiet_mode)
         self.COLUMN_HEIGHT = "height"
         self.COLUMN_VALUE = "value"
         self.COLUMN_LON = "lon"
         self.COLUMN_LAT = "lat"
         self.VALUE_NAN = -9999.0
+        self.interpolation_mode = interpolation_mode
         self.__setup_variable_config(config_filename)
         self.__HEIGHT_CORRECTION_FACTOR = {"tx": 0.006, "tn": 0.006, "ta": 0.006, "ta6": 0.006, "pd": 0.00025}
@@ -167,18 +171,30 @@ def __init__(self, config_filename: str, start_date: datetime = None,
         self._dem_height_correction_kdt_file = Path(self.config_path).joinpath('dem.kdt')
         self.__setup_dem()
-
-        # self.interpolation_mode = 'nearest'
-        self.interpolation_mode = 'invdist'
-        # self.interpolation_mode = 'bilinear'
-        # self.interpolation_mode = 'triangulation'
-        # self.interpolation_mode = 'bilinear_delaunay'
-        self.scipy_modes_nnear = {'nearest': 1, 'invdist': 4, 'bilinear': 4, 'triangulation': 3, 'bilinear_delaunay': 4}
-        self.grid_details = {'gridType': 'NOT rotated', 'Nj': 1, 'radius': 1.0}
-
+        self.__setup_interpolation_parameters()
         self.__setup_data_structures()
         self.__setup_dates(start_date, end_date)

+    def __setup_interpolation_parameters(self):
+        self.scipy_modes_nnear = Config.INTERPOLATION_MODES
+        self.grid_details = {'gridType': 'NOT rotated', 'Nj': 1, 'radius': 6367470.0}
+        self.min_upper_bound = 100000000 # max search distance in meters
+        cdd_map_path = Path(self.config_path).joinpath(f'CDDmap_{self.var_code}.nc')
+        self.cdd_map = f'{cdd_map_path}'
+        if self.interpolation_mode == 'cdd' and not os.path.isfile(self.cdd_map):
+            raise ArgumentTypeError(f'CDD map was not found: {self.cdd_map}')
+        self.cdd_mode = 'MixHofstraShepard'
+        # 1) m=4; r=1/3; min 4 stations
+        # 2) m=8, r=1/3; min 4 stations
+        # 3) m=8; r=2/3; min 4 stations
+        # 4) m=4; r=1/3; min 4 stations + CDD map without Euro4m, CarpatClim
+        self.cdd_options = {
+            'm_const': 4,
+            'min_num_of_station': 4,
+            'radius_ratio': 1/3,
+            'weights_mode': 'All' # 'OnlyTOP10'
+        }
+
     def __setup_dates(self, start_date: datetime = None, end_date: datetime = None):
         netcdf_var_time_unit_pattern = self.get_config_field('VAR_TIME','UNIT_PATTERN')
         if start_date is None:
@@ -202,7 +218,7 @@ def __setup_variable_config(self, config_filename: str):
         self.__configFile.read(config_filename)

     def __setup_dem(self):
-        self.dem = Dem(self._dem_file)
+        self.dem = Dem(self._dem_file, self.quiet_mode)
         self.dem_nrows = self.dem.nrows
         self.dem_ncols = self.dem.ncols
         self.dem_lons = self.dem.lons.flatten()
@@ -263,39 +279,62 @@ def __setup_data_structures(self):
         self.DEM_HEIGHT_CORRECTION_QUERY = pickle.load(self._dem_height_correction_kdt_file.open("rb"))
         self.print_msg('Finish loading data structures')

-    def get_config_field(self, config_group: str = '', config_property: str = ''):
+    def get_config_field(self, config_group: str = '', config_property: str = '') -> str:
         return self.__configFile.get(config_group, config_property)

     @property
-    def scale_factor(self):
+    def scale_factor(self) -> float:
         return float(self.get_config_field('PROPERTIES', 'VALUE_SCALE'))

     @property
-    def add_offset(self):
+    def add_offset(self) -> float:
         return float(self.get_config_field('PROPERTIES', 'VALUE_OFFSET'))

     @property
-    def var_code(self):
+    def value_min(self) -> int:
+        return int(self.get_config_field('PROPERTIES', 'VALUE_MIN'))
+
+    @property
+    def value_max(self) -> int:
+        return int(self.get_config_field('PROPERTIES', 'VALUE_MAX'))
+
+    @property
+    def value_min_packed(self) -> int:
+        return int((self.value_min - self.add_offset) / self.scale_factor)
+
+    @property
+    def value_max_packed(self) -> int:
+        return int((self.value_max - self.add_offset) / self.scale_factor)
+
+    @property
+    def value_nan_packed(self) -> float:
+        return float((self.VALUE_NAN - self.add_offset) / self.scale_factor)
+
+    @property
+    def var_code(self) -> str:
         return self.get_config_field('PROPERTIES', 'VAR_CODE')

     @property
-    def do_height_correction(self):
+    def do_height_correction(self) -> bool:
         return self.var_code in self.__HEIGHT_CORRECTION_FACTOR

     @property
-    def height_correction_factor(self):
+    def height_correction_factor(self) -> float:
         return self.__HEIGHT_CORRECTION_FACTOR[self.var_code]

     @property
-    def neighbours_near(self):
+    def neighbours_near(self) -> int:
         return self.scipy_modes_nnear[self.interpolation_mode]


 class GriddingUtils(Printable):
-    def __init__(self, conf: Config, quiet_mode: bool = False):
+    def __init__(self, conf: Config, quiet_mode: bool = False, use_broadcasting: bool = False):
         super().__init__(quiet_mode)
         self.conf = conf
+        self.use_broadcasting = use_broadcasting
+        self.unit_conversion = float(self.conf.get_config_field('PROPERTIES', 'UNIT_CONVERSION'))
+        # self.compressed_NaN = (self.conf.VALUE_NAN - self.conf.add_offset) / self.conf.scale_factor

     def correct_height(self, df: pd.DataFrame) -> pd.DataFrame:
         if self.conf.do_height_correction:
@@ -325,8 +364,14 @@ def reset_height(self, result: ma.MaskedArray) -> np.ndarray:

     def prepare_grid(self, result: np.ndarray, grid_shape: np.ndarray) -> np.ndarray:
         # Compress data
-        result[np.where(result!=self.conf.VALUE_NAN)] /= self.conf.scale_factor
-        result[np.where(result!=self.conf.VALUE_NAN)] += self.conf.add_offset
+        if self.unit_conversion != 1.0:
+            result = result * self.unit_conversion
+        result[np.where(result == self.conf.VALUE_NAN)] = np.nan
+        result[np.where(result < self.conf.value_min)] = np.nan
+        result[np.where(result > self.conf.value_max)] = np.nan
+        result = np.round(result.astype(float), 1)
+        result = (result - self.conf.add_offset) / self.conf.scale_factor
+        result[np.isnan(result)] = self.conf.VALUE_NAN
         # Reshape grid
         grid_data = result.reshape(grid_shape)
         return grid_data
@@ -335,12 +380,10 @@ def check_grid_nan(self, filename: Path, result: np.ndarray):
         # Verify if grid contains NaN different than the ones on the DEM.
         # Which means there are NaN values that shouldn't be generated.
         count_dem_nan = self.conf.dem.count_nan
-        value_min = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MIN'))
-        value_max = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MAX'))
         current_grid = result.copy()
-        current_grid[np.where(current_grid < value_min)] = np.nan
-        current_grid[np.where(current_grid > value_max)] = np.nan
         current_grid[np.where(current_grid == self.conf.VALUE_NAN)] = np.nan
+        current_grid[np.where(current_grid < self.conf.value_min)] = np.nan
+        current_grid[np.where(current_grid > self.conf.value_max)] = np.nan
         count_grid_nan = np.count_nonzero(np.isnan(current_grid))
         if count_dem_nan != count_grid_nan:
             print(f"WARNING: The grid interpolated from file {filename.name} contains different NaN values ({count_grid_nan}) than the DEM ({count_dem_nan}). diff: {count_grid_nan - count_dem_nan}")
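Both the `value_*_packed` properties and the rewritten `prepare_grid` implement the usual netCDF packing convention, `packed = (physical - add_offset) / scale_factor`, with out-of-range values forced back to the `VALUE_NAN` sentinel. A small numeric round trip; scale, offset and valid range here are illustrative, not taken from any shipped configuration:

```python
import numpy as np

scale_factor, add_offset, VALUE_NAN = 0.1, 0.0, -9999.0
value_min, value_max = 0, 500            # physical valid range, e.g. precipitation in mm

result = np.array([12.34, -5.0, 700.0, VALUE_NAN])
result[result == VALUE_NAN] = np.nan     # sentinel -> NaN
result[result < value_min] = np.nan      # clip to the physical valid range
result[result > value_max] = np.nan
result = np.round(result, 1)             # keep one decimal of physical precision
packed = (result - add_offset) / scale_factor
packed[np.isnan(packed)] = VALUE_NAN     # NaN -> sentinel again
print(packed)                            # [  123. -9999. -9999. -9999.]

# The packed valid range mirrors Config.value_min_packed / value_max_packed:
print((value_min - add_offset) / scale_factor,
      (value_max - add_offset) / scale_factor)   # 0.0 5000.0
```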

@@ -360,10 +403,24 @@ def generate_grid(self, filename: Path) -> np.ndarray:
             xp = np.array(x)
             yp = np.array(y)
             values = np.array(z)
-            scipy_interpolation = ScipyInterpolation(xp, yp, self.conf.grid_details, values,
-                                                     self.conf.neighbours_near, mv_target, mv_source,
-                                                     target_is_rotated=False, parallel=False,
-                                                     mode=self.conf.interpolation_mode)
+            if self.conf.interpolation_mode == 'cdd':
+                scipy_interpolation = ScipyInterpolation(xp, yp, self.conf.grid_details, values,
+                                                         self.conf.neighbours_near, mv_target, mv_source,
+                                                         target_is_rotated=False, parallel=False,
+                                                         mode=self.conf.interpolation_mode,
+                                                         cdd_map=self.conf.cdd_map,
+                                                         cdd_mode=self.conf.cdd_mode,
+                                                         cdd_options=self.conf.cdd_options,
+                                                         use_broadcasting=self.use_broadcasting)
+            else:
+                scipy_interpolation = ScipyInterpolation(xp, yp, self.conf.grid_details, values,
+                                                         self.conf.neighbours_near, mv_target, mv_source,
+                                                         target_is_rotated=False, parallel=False,
+                                                         mode=self.conf.interpolation_mode,
+                                                         use_broadcasting=self.use_broadcasting)
+
+            # reset search radius to a fixed value
+            scipy_interpolation.min_upper_bound = self.conf.min_upper_bound
             results, weights, indexes = scipy_interpolation.interpolate(grid_x, grid_y)
             result = np.ma.masked_array(data=results, mask=self.conf.dem_mask, fill_value=self.conf.VALUE_NAN)
             self.print_msg('Finish interpolation')
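The two `ScipyInterpolation` constructor calls above differ only in the CDD-specific keyword arguments. A runnable sketch of a kwargs-dict shape that would collapse them into one call; `FakeInterpolator` is a stand-in so the snippet is self-contained, and only the keyword names come from the hunk above:

```python
class FakeInterpolator:
    """Stand-in for the ScipyInterpolation constructor used above."""
    def __init__(self, *args, **kwargs):
        self.args, self.kwargs = args, kwargs

def build(mode: str, use_broadcasting: bool, cdd_map=None, cdd_mode=None, cdd_options=None):
    kwargs = dict(target_is_rotated=False, parallel=False,
                  mode=mode, use_broadcasting=use_broadcasting)
    if mode == 'cdd':
        # Only CDD needs the correlation-distance map and its options on top of the common arguments.
        kwargs.update(cdd_map=cdd_map, cdd_mode=cdd_mode, cdd_options=cdd_options)
    return FakeInterpolator(**kwargs)

print(build('adw', True).kwargs['mode'])                                  # adw
print('cdd_map' in build('cdd', False, cdd_map='CDDmap_pr6.nc').kwargs)   # True
```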

diff --git a/src/lisfloodutilities/gridding/lib/writers.py b/src/lisfloodutilities/gridding/lib/writers.py
index 0dd7db9..882894c 100644
--- a/src/lisfloodutilities/gridding/lib/writers.py
+++ b/src/lisfloodutilities/gridding/lib/writers.py
@@ -20,7 +20,8 @@
 from netCDF4 import Dataset, default_fillvals, date2num, num2date
 from netCDF4._netCDF4 import Variable as NetCDF4Variable
 from osgeo import osr, gdal
-from lisfloodutilities.gridding.lib.utils import Printable, Config
+from src.lisfloodutilities.gridding.lib.utils import Printable, Config, FileUtils
+from src.lisfloodutilities import version


 class OutputWriter(Printable):
@@ -30,9 +31,26 @@ def __init__(self, conf: Config, overwrite_file: bool = False, quiet_mode: bool
         self.filepath = None
         self.overwrite_file = overwrite_file
         self.conf = conf
+        self.__setup_metadata()
+
+    def __setup_metadata(self):
+        # General Attributes
+        self.Source_Software = f'lisflood-utilities.gridding v{version} (interpolation: {self.conf.interpolation_mode})'
+        self.reference = self.conf.get_config_field('GENERIC','NETCDF_REFERENCE')
+        self.title = self.conf.get_config_field('GENERIC','NETCDF_TITLE')
+        self.keywords = self.conf.get_config_field('GENERIC','NETCDF_KEYWORDS')
+        self.source = self.conf.get_config_field('GENERIC','NETCDF_SOURCE')
+        self.institution = self.conf.get_config_field('GENERIC','NETCDF_INSTITUTION')
+        self.comment = self.conf.get_config_field('GENERIC','NETCDF_COMMENT')
+        self.var_code = self.conf.get_config_field('PROPERTIES', 'VAR_CODE')
+        self.var_standard_name = self.conf.get_config_field('PROPERTIES', 'STANDARD_NAME')
+        self.var_long_name = self.conf.get_config_field('PROPERTIES', 'LONG_NAME')
+        self.cell_methods = self.conf.get_config_field('PROPERTIES', 'CELL_METHODS')
+        self.units = self.conf.get_config_field('PROPERTIES', 'UNIT')

     def open(self, out_filename: Path):
         self.filepath = out_filename
+        self.time_created = timex.ctime(timex.time())

     def write(self, grid: np.ndarray, timestamp: datetime = None):
         raise NotImplementedError
@@ -61,10 +79,6 @@ def __init__(self, conf: Config, overwrite_file: bool = False, quiet_mode: bool
         self.netcdf_var_time_unit_pattern = self.conf.get_config_field('VAR_TIME','UNIT_PATTERN')
         self.netcdf_var_time = self.conf.get_config_field('VAR_TIME','NAME')
         self.time_frequency = int(self.conf.get_config_field('VAR_TIME','FREQUENCY'))
-        self.var_code = self.conf.get_config_field('PROPERTIES', 'VAR_CODE')
-        self.unit_conversion = float(self.conf.get_config_field('PROPERTIES', 'UNIT_CONVERSION'))
-        self.scale_factor = float(self.conf.get_config_field('PROPERTIES', 'VALUE_SCALE'))
-        self.add_offset = float(self.conf.get_config_field('PROPERTIES', 'VALUE_OFFSET'))
         self.calendar_type = self.NETCDF_VAR_TIME_CALENDAR_TYPE
         self.calendar_time_unit = self.conf.start_date.strftime(self.netcdf_var_time_unit_pattern)
@@ -78,6 +92,25 @@ def open(self, out_filename: Path):
         else:
             raise ArgumentTypeError(f'File {self.filepath} already exists. Use --force flag to append.')

+    def setNaN(self, value, defaultNaN=np.nan):
+        try:
+            value[value==1e31] = defaultNaN
+        except Exception as e:
+            self.print_msg(f"value==1e31 : {str(e)}")
+        try:
+            value[value==self.conf.VALUE_NAN] = defaultNaN
+        except Exception as e:
+            self.print_msg(f"value=={self.conf.VALUE_NAN} : {str(e)}")
+        try:
+            value[value==-32768.0] = defaultNaN
+        except Exception as e:
+            self.print_msg(f"value==-32768.0 : {str(e)}")
+        try:
+            value[value==31082] = defaultNaN
+        except Exception as e:
+            self.print_msg(f"value==31082 : {str(e)}")
+        return value
+
     def write(self, grid: np.ndarray, timestamp: datetime = None):
         timestep = -1
         if timestamp is not None:
@@ -90,13 +123,12 @@ def write_timestep(self, grid: np.ndarray, timestep: int = -1):
             raise Exception("netCDF Dataset was not initialized. If file already exists, use --force flag to append.")
         timestep_idx = int(timestep / self.time_frequency)
         self.nf.variables[self.netcdf_var_time][timestep_idx] = timestep
-        values = grid
-        values[values!=self.conf.VALUE_NAN] -= self.add_offset
-        values[values!=self.conf.VALUE_NAN] *= self.scale_factor
-        # values = self.setNaN(grid)
-        if self.unit_conversion != 1.0:
-            values = values * self.unit_conversion
-        values[np.isnan(values)] = (self.conf.VALUE_NAN - self.add_offset) * self.scale_factor
+        values = self.setNaN(grid)
+        values[values < self.conf.value_min_packed] = np.nan
+        values[values > self.conf.value_max_packed] = np.nan
+        values[values != self.conf.VALUE_NAN] *= self.conf.scale_factor
+        values[values != self.conf.VALUE_NAN] += self.conf.add_offset
+        values[np.isnan(values)] = self.conf.VALUE_NAN * self.conf.scale_factor + self.conf.add_offset
         self.nf.variables[self.var_code][timestep_idx, :, :] = values

     def opened(self) -> bool:
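Note the inversion relative to the old code: `write_timestep` now receives grids already packed by `prepare_grid`, converts them back to physical units, and relies on the variable's `scale_factor`/`add_offset` attributes plus `set_auto_maskandscale(True)` (set in `__setup_netcdf_metadata` below) to re-pack on assignment. That netCDF4 behaviour can be checked in isolation; the file name and values here are illustrative:

```python
import numpy as np
from netCDF4 import Dataset

nc = Dataset('packed_demo.nc', 'w')          # throwaway demo file
nc.createDimension('x', 3)
var = nc.createVariable('pr6', 'i2', ('x',), fill_value=-9999)
var.scale_factor = 0.1
var.add_offset = 0.0
var.set_auto_maskandscale(True)              # pack on write, unpack on read

var[:] = np.array([12.3, 0.5, 4.0])          # assign *physical* values
nc.close()

nc = Dataset('packed_demo.nc')
print(nc.variables['pr6'][:])                # unpacked again: [12.3  0.5  4. ]
nc.variables['pr6'].set_auto_maskandscale(False)
print(nc.variables['pr6'][:])                # raw int16 storage: [123   5  40]
nc.close()
```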

@@ -115,16 +147,15 @@ def __set_property(self, netcdf_var: NetCDF4Variable, property: str, section: st

     def __setup_netcdf_metadata(self, start_date: datetime = None):
         # General Attributes
-        time_created = timex.ctime(timex.time())
-        self.nf.history = f'Created {time_created}'
+        self.nf.history = f'Created {self.time_created}'
         self.nf.Conventions = self.NETCDF_CONVENTIONS
-        self.nf.Source_Software = self.conf.get_config_field('GENERIC','NETCDF_SPHEREMAP_VERSION')
-        self.nf.reference = self.conf.get_config_field('GENERIC','NETCDF_REFERENCE')
-        self.nf.title = self.conf.get_config_field('GENERIC','NETCDF_TITLE')
-        self.nf.keywords = self.conf.get_config_field('GENERIC','NETCDF_KEYWORDS')
-        self.nf.source = self.conf.get_config_field('GENERIC','NETCDF_SOURCE')
-        self.nf.institution = self.conf.get_config_field('GENERIC','NETCDF_INSTITUTION')
-        self.nf.comment = self.conf.get_config_field('GENERIC','NETCDF_COMMENT')
+        self.nf.Source_Software = self.Source_Software
+        self.nf.reference = self.reference
+        self.nf.title = self.title
+        self.nf.keywords = self.keywords
+        self.nf.source = self.source
+        self.nf.institution = self.institution
+        self.nf.comment = self.comment

         netcdf_var_x = self.conf.get_config_field('VAR_X','NAME')
         netcdf_var_y = self.conf.get_config_field('VAR_Y','NAME')
@@ -169,23 +200,20 @@ def __setup_netcdf_metadata(self, start_date: datetime = None):
         #     self.__set_property(proj, 'longitude_of_prime_meridian', 'PROJECTION', 'LONGITUDE_PRIME_MERIDIAN', float)
         #     self.__set_property(proj, 'GeoTransform', 'PROJECTION', 'GEO_TRANSFORM', self.__get_tuple)

-        var_std_name = self.conf.get_config_field('PROPERTIES', 'STANDARD_NAME')
         var_data_type_packed = self.conf.get_config_field('PROPERTIES', 'DATA_TYPE_PACKED')
-        value_min = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MIN'))
-        value_max = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MAX'))
         value = self.nf.createVariable(self.var_code, var_data_type_packed,
                                        (self.netcdf_var_time, netcdf_var_y, netcdf_var_x),
                                        zlib=True, complevel=self.COMPRESSION_LEVEL,
                                        fill_value=self.conf.VALUE_NAN) # , least_significant_digit=2)
         value.missing_value=self.conf.VALUE_NAN
-        value.standard_name = var_std_name
-        value.long_name = self.conf.get_config_field('PROPERTIES', 'LONG_NAME')
-        value.cell_methods = self.conf.get_config_field('PROPERTIES', 'CELL_METHODS')
-        value.units = self.conf.get_config_field('PROPERTIES', 'UNIT')
-        value.valid_min = int(value_min / self.scale_factor)
-        if value_max != self.conf.VALUE_NAN:
-            value.valid_max = int(value_max / self.scale_factor)
-        value.scale_factor = self.scale_factor
-        value.add_offset = self.add_offset
+        value.standard_name = self.var_standard_name
+        value.long_name = self.var_long_name
+        value.cell_methods = self.cell_methods
+        value.units = self.units
+        value.valid_min = self.conf.value_min_packed
+        if self.conf.value_max != self.conf.VALUE_NAN:
+            value.valid_max = self.conf.value_max_packed
+        value.scale_factor = self.conf.scale_factor
+        value.add_offset = self.conf.add_offset
         value.set_auto_maskandscale(True)
         self.__set_property(value, 'grid_mapping', 'PROJECTION', 'GRID_MAPPING')
         self.__set_property(value, 'esri_pe_string', 'PROJECTION', 'STRING')
@@ -195,13 +223,13 @@ def __setup_netcdf_metadata(self, start_date: datetime = None):
         latitude[:] = self.conf.dem.lat_values
         longitude[:] = self.conf.dem.lon_values

-        self.print_msg(f'var-name: {var_std_name}')
+        self.print_msg(f'var-name: {self.var_standard_name}')
         self.print_msg(f'data-type-packed: {var_data_type_packed}')
         self.print_msg(f'scale: {value.scale}')
         self.print_msg(f'scale-factor: {value.scale_factor}')
         self.print_msg(f'offset: {value.add_offset}')
         self.print_msg(f'valid-min: {value.valid_min}')
-        if value_max != self.conf.VALUE_NAN:
+        if self.conf.value_max != self.conf.VALUE_NAN:
             self.print_msg(f'valid-max: {value.valid_max}')
         else:
             self.print_msg('valid-max: N/A')
@@ -211,10 +239,35 @@ class GDALWriter(OutputWriter):

     def __init__(self, conf: Config, overwrite_file: bool = False, quiet_mode: bool = False):
         super().__init__(conf, overwrite_file, quiet_mode)
+        self.current_timestamp = None
         self.driver_gtiff = gdal.GetDriverByName("GTiff")

+    def setup_dataset_metadata(self, ds: gdal.Dataset) -> gdal.Dataset:
+        ds.SetMetadataItem('SCALE', f'{self.conf.scale_factor}')
+        ds.SetMetadataItem('OFFSET', f'{self.conf.add_offset}')
+        ds.SetMetadataItem('NODATA', f'{self.conf.VALUE_NAN}')
+        ds.SetMetadataItem('Source_Software', self.Source_Software)
+        ds.SetMetadataItem('Created', f'{self.time_created}')
+        ds.SetMetadataItem('reference', f'{self.reference}')
+        ds.SetMetadataItem('title', f'{self.title}')
+        ds.SetMetadataItem('keywords', f'{self.keywords}')
+        ds.SetMetadataItem('source', f'{self.source}')
+        ds.SetMetadataItem('institution', f'{self.institution}')
+        ds.SetMetadataItem('comment', f'{self.comment}')
+        ds.SetMetadataItem('var_code', f'{self.var_code}')
+        ds.SetMetadataItem('var_standard_name', f'{self.var_standard_name}')
+        ds.SetMetadataItem('var_long_name', f'{self.var_long_name}')
+        ds.SetMetadataItem('cell_methods', f'{self.cell_methods}')
+        ds.SetMetadataItem('units', f'{self.units}')
+        if self.current_timestamp is not None:
+            ds.SetMetadataItem('Timestamp', f'{self.current_timestamp}')
+        return ds
+
     def write(self, grid: np.ndarray, timestamp: datetime = None):
+        if timestamp is not None:
+            self.current_timestamp = timestamp.strftime(FileUtils.DATE_PATTERN_SEPARATED)
         self.write_timestep(grid)
+        self.current_timestamp = None

     def write_timestep(self, grid: np.ndarray, timestep: int = -1):
         if not self.opened():
@@ -222,7 +275,7 @@ def write_timestep(self, grid: np.ndarray, timestep: int = -1):
         if self.overwrite_file or not self.filepath.is_file():
             self.print_msg(f'Generating file: {self.filepath}')
             size_lats, size_lons = grid.shape
-            ds = self.driver_gtiff.Create(str(self.filepath), size_lons, size_lats, 1, gdal.GDT_Int16)
+            ds = self.driver_gtiff.Create(str(self.filepath), size_lons, size_lats, 1, gdal.GDT_Int16, options=['COMPRESS=LZW'])
             # Upper Left x, East-West px resolution, rotation, Upper Left y, rotation, North-South px resolution
             ds.SetGeoTransform( [self.conf.COORDINATE_MIN_X, self.conf.CELL_SIZE_X, 0, self.conf.COORDINATE_MAX_Y, 0, self.conf.CELL_SIZE_Y ] )
             # Set CRS
@@ -230,10 +283,11 @@ def write_timestep(self, grid: np.ndarray, timestep: int = -1):
             srs.SetWellKnownGeogCS("WGS84")
             ds.SetProjection( srs.ExportToWkt() )
             # Write the band
-            ds.GetRasterBand(1).SetNoDataValue(self.conf.VALUE_NAN) # optional if no-data transparent
+            ds.GetRasterBand(1).SetNoDataValue(self.conf.VALUE_NAN)
             ds.GetRasterBand(1).SetScale(self.conf.scale_factor)
             ds.GetRasterBand(1).SetOffset(self.conf.add_offset)
-            ds.GetRasterBand(1).WriteArray(grid)
+            ds = self.setup_dataset_metadata(ds)
+            ds.GetRasterBand(1).WriteArray(grid.astype(np.int16))
             ds.FlushCache()
             ds = None
             self.print_msg(f'Wrote file: {self.filepath}')
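The scale and offset now travel with each GeoTIFF twice: as band attributes (`SetScale`/`SetOffset`) and as dataset metadata items (`SCALE`/`OFFSET`/`NODATA`), which is exactly what `get_unpacking_metadata` in the analysis tools below reads back. A hedged round trip with an illustrative file name:

```python
import numpy as np
from osgeo import gdal, osr

# Write a tiny packed GeoTIFF the same way GDALWriter does (values already packed to int16).
drv = gdal.GetDriverByName('GTiff')
ds = drv.Create('demo.tiff', 2, 2, 1, gdal.GDT_Int16, options=['COMPRESS=LZW'])
srs = osr.SpatialReference()
srs.SetWellKnownGeogCS('WGS84')
ds.SetProjection(srs.ExportToWkt())
ds.GetRasterBand(1).SetNoDataValue(-9999.0)
ds.GetRasterBand(1).SetScale(0.1)
ds.GetRasterBand(1).SetOffset(0.0)
ds.SetMetadataItem('SCALE', '0.1')
ds.SetMetadataItem('NODATA', '-9999.0')
ds.GetRasterBand(1).WriteArray(np.array([[123, -9999], [5, 40]], dtype=np.int16))
ds.FlushCache()
ds = None

# Read the packing metadata back: band attributes first, metadata items as fallback.
ds = gdal.Open('demo.tiff')
scale = ds.GetRasterBand(1).GetScale() or float(ds.GetMetadata().get('SCALE', 1.0))
print(scale, ds.GetRasterBand(1).GetNoDataValue())   # 0.1 -9999.0
print(ds.GetRasterBand(1).ReadAsArray() * scale)     # physical values (nodata pixels still carry the sentinel)
```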

diff --git a/src/lisfloodutilities/gridding/tools/leave1out.py b/src/lisfloodutilities/gridding/tools/leave1out.py
new file mode 100644
index 0000000..96b0991
--- /dev/null
+++ b/src/lisfloodutilities/gridding/tools/leave1out.py
@@ -0,0 +1,165 @@
+__author__="Goncalo Gomes"
+__date__="$Jul 12, 2023 12:01:00$"
+__version__="0.1"
+__updated__="$Jul 13, 2023 10:41:00$"
+
+"""
+Copyright 2019-2020 European Union
+Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+You may not use this work except in compliance with the Licence.
+You may obtain a copy of the Licence at:
+https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the Licence for the specific language governing permissions and limitations under the Licence.
+
+"""
+
+import sys
+import os
+import csv
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentTypeError
+import pandas as pd
+from lisfloodutilities.gridding.lib.utils import FileUtils
+from sklearn.model_selection import RepeatedStratifiedKFold
+
+
+def write_datasets(i: int, orig_outfilepath: Path, X_df: pd.DataFrame, y_df: pd.DataFrame, write_train_dataset=False):
+    if i > 9:
+        test_folder = Path(orig_outfilepath.parent).joinpath(f't{i}')
+    else:
+        test_folder = Path(orig_outfilepath.parent).joinpath(f't0{i}')
+    output_kiwis_filename = orig_outfilepath.name
+    kiwis_filepath = Path(test_folder).joinpath(output_kiwis_filename)
+    Path(kiwis_filepath.parent).mkdir(parents=True, exist_ok=True)
+    # Create an empty kiwis file once to be used later by the spheremap program to detect the files to be processed
+    if write_train_dataset:
+        with open(kiwis_filepath, mode='a'): pass
+        output_filename = output_kiwis_filename.replace('all.kiwis', '20230714101901.txt')
+    else:
+        # write test dataset
+        output_filename = output_kiwis_filename.replace('all.kiwis', 'TEST_DATASET.txt')
+    output_file = test_folder.joinpath(output_filename)
+    df2 = pd.DataFrame({
+        'x': X_df['station_local_x'].values,
+        'y': X_df['station_local_y'].values,
+        'value': y_df['ts_value'].values,
+    })
+    df2.to_csv(output_file, sep='\t', header=False, index=False)
+
+
+def run(infolder: str, outfolder: str):
+    inwildcard = '*_all.kiwis'
+
+    for filename in sorted(Path(infolder).rglob(inwildcard)):
+        print(f'Processing file: {filename}')
+        outfile = str(filename).replace(infolder, outfolder)
+        outfilepath = Path(outfile)
+        # Create the output parent folders if not exist yet
+        Path(outfilepath.parent).mkdir(parents=True, exist_ok=True)
+
+        # Load the CSV file into a pandas dataframe
+        df = pd.read_csv(filename, delimiter='\t')
+
+        # Clean the dataset from unwanted rows
+        df = df.astype(str)
+        df = df[df['EFAS-ADDATTR-NOGRIDDING'].isin(['no'])]
+        df = df[df['EFAS_ADDATTR_ISINNEWDOMAIN'].isin(['yes'])]
+        df = df[~df['EXCLUDE'].isin(['yes'])]
+        df = df[df['q_code'].isin(['40', '120'])]
+
+        # Use the x and y coordinate columns as our input features (X),
+        # and the value column as our target variable (y)
+        X = df[['station_local_x', 'station_local_y']]
+        # y = df[['site_no', 'ts_value']]
+        y = df[['ts_value']]
+
+        # Instantiate the RepeatedStratifiedKFold object
+        rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=47)
+
+        i = 0
+        # Iterate over the folds created by RepeatedStratifiedKFold
+        for train_index, test_index in rskf.split(X, y):
+            i += 1
+            # Split the data into training and testing sets based on the current fold indices
+            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
+            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
+
+            # Write the train dataset
+            X_train_df = X_train[['station_local_x', 'station_local_y']]
+            y_train_df = y_train[['ts_value']]
+            write_datasets(i, outfilepath, X_train_df, y_train_df, write_train_dataset=True)
+
+            # Write the test dataset
+            X_test_df = X_test[['station_local_x', 'station_local_y']]
+            y_test_df = y_test[['ts_value']]
+            write_datasets(i, outfilepath, X_test_df, y_test_df, write_train_dataset=False)
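`RepeatedStratifiedKFold` yields `n_splits * n_repeats` train/test index pairs, here 5 x 10 = 50 splits with roughly 20% of the stations held out each time; stratification needs a discrete target, which is presumably why the kiwis values were cast to strings above. The mechanics on toy data:

```python
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

X = np.arange(20).reshape(10, 2)     # toy station coordinates
y = np.array(['wet', 'dry'] * 5)     # discrete labels: stratification keeps their ratio per fold

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=47)
for fold, (train_idx, test_idx) in enumerate(rskf.split(X, y), start=1):
    # 10 folds in total; each test set holds one 'wet' and one 'dry' sample.
    print(fold, 'train:', len(train_idx), 'test:', test_idx)
```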
+
+
+def main(argv):
+    '''Command line options.'''
+    global quiet_mode
+
+    program_name = os.path.basename(sys.argv[0])
+    program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+    program_version = "v%s" % __version__
+    program_build_date = "%s" % __updated__
+
+    program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)
+    program_longdesc = '''
+    This script uses RepeatedStratifiedKFold to generate several input files as training datasets where a small percentage
+    of evenly distributed stations is removed to constitute the test cases. Later the training datasets will be interpolated
+    with several algorithms: Spheremap, Inverse distance and Angular Distance Weighting. Then the error at the coordinates of
+    the test cases is calculated as the difference between the interpolated value and the real value in the test case.
+    '''
+    program_license = """
+    Copyright 2019-2020 European Union
+    Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+    You may not use this work except in compliance with the Licence.
+    You may obtain a copy of the Licence at:
+    https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+    Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the Licence for the specific language governing permissions and limitations under the Licence.
+    """
+
+    try:
+        # setup option parser
+        parser = ArgumentParser(epilog=program_license, description=program_version_string+program_longdesc)
+
+#         # set defaults
+#         parser.set_defaults(quiet=False,
+#                             out_tiff=False,
+#                             overwrite_output=False,
+#                             start_date='',
+#                             end_date=END_DATE_DEFAULT)
+
+        parser.add_argument("-i", "--in", dest="infolder", required=True, type=FileUtils.folder_type,
+                            help="Set input folder path with kiwis files",
+                            metavar="input_folder")
+        parser.add_argument("-o", "--out", dest="outfolder", required=True, type=FileUtils.folder_type,
+                            help="Set output folder base path.",
+                            metavar="output_folder")
+
+        # process options
+        args = parser.parse_args(argv)
+
+        print(f"Input Folder: {args.infolder}")
+        print(f"Output Folder: {args.outfolder}")
+
+        run(args.infolder, args.outfolder)
+        print("Finished.")
+    except Exception as e:
+        indent = len(program_name) * " "
+        sys.stderr.write(program_name + ": " + repr(e) + "\n")
+        sys.stderr.write(indent + "  for help use --help")
+        return 2
+
+
+def main_script():
+    sys.exit(main(sys.argv[1:]))
+
+
+if __name__ == "__main__":
+    main_script()

diff --git a/src/lisfloodutilities/gridding/tools/leave1out_analysis.py b/src/lisfloodutilities/gridding/tools/leave1out_analysis.py
new file mode 100644
index 0000000..098ccc5
--- /dev/null
+++ b/src/lisfloodutilities/gridding/tools/leave1out_analysis.py
@@ -0,0 +1,291 @@
+__author__="Goncalo Gomes"
+__date__="$Jul 12, 2023 12:01:00$"
+__version__="0.1"
+__updated__="$Jul 13, 2023 10:41:00$"
+
+"""
+Copyright 2019-2020 European Union
+Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+You may not use this work except in compliance with the Licence.
+You may obtain a copy of the Licence at:
+https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the Licence for the specific language governing permissions and limitations under the Licence.
+
+"""
+
+import sys
+import os
+import csv
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentTypeError
+from lisfloodutilities.gridding.lib.utils import FileUtils
+import pandas as pd
+import numpy as np
+import rioxarray as rxr
+from osgeo import gdal
+import netCDF4 as nc
+from scipy.spatial import cKDTree
+from scipy.stats import pearsonr
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.metrics import mean_absolute_error
+from sklearn.feature_selection import r_regression
+
+
+def mean_bias_error(y_true: np.array, y_pred: np.array) -> float:
+    '''
+    Parameters:
+        y_true (array): Array of observed values
+        y_pred (array): Array of prediction values
+
+    Returns:
+        mbe (float): Bias score
+    '''
+    mbe = np.mean(y_true - y_pred)
+    return mbe
+
+
+def get_netcdf_meteo_variable_name(nc_file_obj):
+    # Only one variable must be present in netcdf files
+    num_dims = 3 if 'time' in nc_file_obj.variables else 2
+    varname = [v for v in nc_file_obj.variables if len(nc_file_obj.variables[v].dimensions) == num_dims][0]
+    return varname
+
+
+def get_pixel_area(pixel_area_file: Path) -> np.ndarray:
+    ncfile = nc.Dataset(pixel_area_file, 'r')
+    var_name = get_netcdf_meteo_variable_name(ncfile)
+    var = ncfile.variables[var_name]
+    data = var[:]
+    ncfile.close()
+    return np.ascontiguousarray(data.flatten())
+
+
+def write_results(outfile: Path, test_code: str, mae: float, mbe: float, pearson_r: float, values_sum: float,
+                  count_pixels_1st_interval: float, count_pixels_2nd_interval: float):
+    if outfile.is_file():
+        df = pd.read_csv(outfile, delimiter='\t')
+        new_data = [{
+            'test': test_code,
+            'mae': mae,
+            'mbe': mbe,
+            'pearson_r': pearson_r,
+            'values_sum': values_sum,
+            'pixels_with_values_0 np.ndarray:
+#     compressed_data = data.astype(float)
+#     decompressed_data = compressed_data * scale_factor + offset
+#     decompressed_data = np.round(decompressed_data, 1)
+#     decompressed_data = np.where(decompressed_data == no_data, np.nan, decompressed_data)
+#     return decompressed_data
+
+
+def unpack_data(data: np.ndarray, scale_factor: float = 1.0, offset: float = 0.0, no_data: float = -9999.0) -> np.ndarray:
+    print(f'scale_factor: {scale_factor} offset: {offset} no_data: {no_data}')
+    compressed_data = data.astype(float)
+    compressed_data = np.where(compressed_data == no_data, np.nan, compressed_data)
+    decompressed_data = compressed_data * scale_factor + offset
+    decompressed_data = np.round(decompressed_data, 1)
+    unpacked_no_data = no_data * scale_factor + offset
+    decompressed_data = np.where(decompressed_data == unpacked_no_data, np.nan, decompressed_data)
+    return decompressed_data
+
+
+def format_metadata(metadata: dict, value: str, key: str, default: float) -> float:
+    if value is None:
+        return float(metadata.get(key, default))
+    return float(value)
+
+
+def get_unpacking_metadata(file_interpolated_values: Path) -> tuple[float, float, float]:
+    ds=gdal.Open(file_interpolated_values.as_posix())
+    gdal_nan = ds.GetRasterBand(1).GetNoDataValue()
+    gdal_scale = ds.GetRasterBand(1).GetScale()
+    gdal_offset = ds.GetRasterBand(1).GetOffset()
+    print(f'gdal_nan: {gdal_nan} gdal_scale: {gdal_scale} gdal_offset: {gdal_offset}')
+    metadata=ds.GetMetadata()
+    print('metadata: ', metadata)
+    scale_factor = format_metadata(metadata, gdal_scale, 'SCALE', 1.0)
+    offset = format_metadata(metadata, gdal_offset, 'OFFSET', 0.0)
+    no_data = format_metadata(metadata, gdal_nan, 'NODATA', -9999.0)
+    ds = None
+    return scale_factor, offset, no_data
+
+
+def get_interpolated_values_dataframe(file_interpolated_values: Path, is_compressed_data: bool) -> pd.DataFrame:
+    scale_factor, offset, no_data = get_unpacking_metadata(file_interpolated_values)
+    dataarray = rxr.open_rasterio(file_interpolated_values)
+    band = dataarray[0]
+    x, y, values = band.x.values, band.y.values, band.values
+    x, y = np.meshgrid(x, y)
+    x, y, values = x.flatten(), y.flatten(), unpack_data(values.flatten(), scale_factor, offset, no_data)
+    df_interpolated_values = pd.DataFrame(
+        {
+            'x': x,
+            'y': y,
+            'value': values,
+        }
+    )
+    # df_interpolated_values = df_interpolated_values.dropna()
+    return df_interpolated_values
+
+
+def get_interpolated_values(df_interpolated_values: pd.DataFrame, df_true_values: pd.DataFrame) -> np.ndarray:
+    interpolated_values = np.ascontiguousarray(df_interpolated_values['value'].values)
+    transformed_coordinates = (df_interpolated_values['x'].values, df_interpolated_values['y'].values)
+    interpolated_values_query = cKDTree(data=np.vstack(transformed_coordinates).T, copy_data=True)
+    stations = np.array([df_true_values['x'], df_true_values['y']]).T
+    _, idx = interpolated_values_query.query(stations)
+    return interpolated_values[idx]
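`get_interpolated_values` samples the interpolated raster at each station location by nearest-neighbour lookup in a `cKDTree` built over the pixel-centre coordinates. The same mechanics in miniature, with toy coordinates:

```python
import numpy as np
from scipy.spatial import cKDTree

# 2x2 grid of pixel centres, one interpolated value each
pixels = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
values = np.array([2.5, 3.0, 1.0, 4.2])

tree = cKDTree(pixels)
stations = np.array([[0.1, 0.2], [0.9, 1.1]])  # station coordinates, off the pixel centres
_, idx = tree.query(stations)                  # index of the nearest pixel centre
print(values[idx])                             # [2.5 4.2]
```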
+
+
+def run(file_true_values: str, file_interpolated_values: str, file_pixel_area: str, outfile: str, is_compressed_data: bool, limit_value: float):
+
+    file_interpolated_values_path = Path(file_interpolated_values)
+    outfile_path = Path(outfile)
+    file_true_values_path = Path(file_true_values)
+    # Load the CSV file into a pandas dataframe
+    df_true_values = pd.read_csv(file_true_values_path, delimiter='\t', header=None, names=['x', 'y', 'true_value'])
+    df_interpolated_values = get_interpolated_values_dataframe(file_interpolated_values_path, is_compressed_data)
+    predicted_values_mm = np.ascontiguousarray(df_interpolated_values['value'].values) # includes NaN
+    df_interpolated_values = df_interpolated_values.dropna()
+    interpolated_values = get_interpolated_values(df_interpolated_values, df_true_values)
+    df_true_values['interpolated_value'] = interpolated_values
+    # write intermediate results to a file
+    df_true_values.to_csv(file_true_values_path.with_suffix('.tab'), sep="\t", index=False)
+
+    # process only values greater than 1.0 mm
+    # limit_value = 1.0
+    df_true_values = df_true_values[(df_true_values['true_value'] > limit_value) & (df_true_values['interpolated_value'] > limit_value)]
+
+    true_values = df_true_values['true_value']
+    predicted_values = df_true_values['interpolated_value']
+
+    test_code = file_interpolated_values_path.parent.name
+    mae = -9999.0
+    mbe = -9999.0
+    pearson_correlation_coeficient = -9999.0
+    values_sum = -9999.0
+    count_pixels_1st_interval = -9999.0
+    count_pixels_2nd_interval = -9999.0
+
+    if len(true_values) > 0 and len(predicted_values) > 0:
+        mae = mean_absolute_error(true_values, predicted_values)
+        mbe = mean_bias_error(true_values, predicted_values)
+        # The correlation needs at least 2 elements in each array
+        if len(true_values) > 1 and len(predicted_values) > 1:
+            pearson_correlation_coeficient, p_value = pearsonr(true_values, predicted_values)
+        # values_sum = np.round(np.nansum(np.ascontiguousarray(df_interpolated_values['value'].values)), 1)
+        values_sum = np.round(np.nansum(np.ascontiguousarray(predicted_values)), 1)
+
+    if len(file_pixel_area) > 0:
+        pixel_area_m2 = get_pixel_area(Path(file_pixel_area))
+        predicted_values_m = predicted_values_mm * 0.001
+        predicted_values_m3 = predicted_values_m * pixel_area_m2
+        values_sum = np.round(np.nansum(predicted_values_m3), 1)
+    # number of pixels with values 0 < X <= 0.1
+    count_pixels_1st_interval = np.count_nonzero((~np.isnan(predicted_values_mm)) & (predicted_values_mm > 0) & (predicted_values_mm <= 0.1))
+    # number of pixels with values 0.1 < X < 1
+    count_pixels_2nd_interval = np.count_nonzero((~np.isnan(predicted_values_mm)) & (predicted_values_mm > 0.1) & (predicted_values_mm < 1))
+
+    # write the results to the output file
+    write_results(outfile_path, test_code, mae, mbe, pearson_correlation_coeficient, values_sum,
+                  count_pixels_1st_interval, count_pixels_2nd_interval)
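The three scores capture different failure modes: MAE the typical error magnitude, MBE the systematic over- or under-estimation (positive means underestimation under the `y_true - y_pred` convention above), and Pearson's r the linear agreement. A tiny worked example:

```python
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error

y_true = np.array([2.0, 4.0, 6.0, 8.0])
y_pred = np.array([2.5, 3.5, 6.5, 7.5])

print(mean_absolute_error(y_true, y_pred))  # 0.5 -> each estimate is off by 0.5 mm
print(np.mean(y_true - y_pred))             # 0.0 -> the errors cancel, no systematic bias
print(pearsonr(y_true, y_pred)[0])          # ~0.976 -> strong linear agreement
```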
+
+
+def main(argv):
+    '''Command line options.'''
+    global quiet_mode
+
+    program_name = os.path.basename(sys.argv[0])
+    program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+    program_version = "v%s" % __version__
+    program_build_date = "%s" % __updated__
+
+    program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)
+    program_longdesc = '''
+    This script uses RepeatedStratifiedKFold to generate several input files as training datasets where a small percentage
+    of evenly distributed stations is removed to constitute the test cases. Later the training datasets will be interpolated
+    with several algorithms: Spheremap, Inverse distance and Angular Distance Weighting. Then the error at the coordinates of
+    the test cases is calculated as the difference between the interpolated value and the real value in the test case.
+    '''
+    program_license = """
+    Copyright 2019-2020 European Union
+    Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+    You may not use this work except in compliance with the Licence.
+    You may obtain a copy of the Licence at:
+    https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+    Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the Licence for the specific language governing permissions and limitations under the Licence.
+    """
+
+    try:
+        # setup option parser
+        parser = ArgumentParser(epilog=program_license, description=program_version_string+program_longdesc)
+
+        # set defaults
+        parser.set_defaults(is_compressed_data=False, limit_value=1.0)
+
+        parser.add_argument("-t", "--test", dest="file_true_values", required=True, type=FileUtils.file_type,
+                            help="Set file path for true values from the test dataset in txt format.",
+                            metavar="txt_file")
+        parser.add_argument("-i", "--interpolated", dest="file_interpolated_values", required=True, type=FileUtils.file_type,
+                            help="Set file path for interpolated values in tiff format.",
+                            metavar="tiff_file")
+        parser.add_argument("-p", "--pixelarea", dest="file_pixel_area", required=False, type=FileUtils.file_type,
+                            help="Set file path for pixel area netCDF.",
+                            metavar="pixel_area_file")
+        parser.add_argument("-o", "--out", dest="outfile", required=True, type=FileUtils.file_or_folder,
+                            help="Set output file containing the statistics.",
+                            metavar="output_file")
+        parser.add_argument("-c", "--compressed", dest="is_compressed_data", action="store_true",
+                            help="Indicates if data in the tiff file is compressed and decompresses it before using. [default: %(default)s]")
+        parser.add_argument("-l", "--limit", dest="limit_value", required=False, type=float,
+                            help="process only values greater than limit", metavar="limit_value")
+
+        # process options
+        args = parser.parse_args(argv)
+
+        print(f"True values: {args.file_true_values}")
+        print(f"Interpolated values: {args.file_interpolated_values}")
+        print(f"Output File: {args.outfile}")
+        print(f"Compressed Data: {args.is_compressed_data}")
+        print(f"Limit Value: {args.limit_value}")
+        file_pixel_area = ""
+        if args.file_pixel_area is not None:
+            file_pixel_area = args.file_pixel_area
+        print(f"Pixel Area File: {file_pixel_area}")
+
+        run(args.file_true_values, args.file_interpolated_values, file_pixel_area, args.outfile, args.is_compressed_data, args.limit_value)
+        print("Finished.")
+    except Exception as e:
+        indent = len(program_name) * " "
+        sys.stderr.write(program_name + ": " + repr(e) + "\n")
+        sys.stderr.write(indent + "  for help use --help")
+        return 2
+
+
+def main_script():
+    sys.exit(main(sys.argv[1:]))
+
+
+if __name__ == "__main__":
+    main_script()
\ No newline at end of file
diff --git a/src/lisfloodutilities/gridding/tools/leave1out_analysis_aggregation.py b/src/lisfloodutilities/gridding/tools/leave1out_analysis_aggregation.py
new file mode 100644
index 0000000..f49a996
--- /dev/null
+++ b/src/lisfloodutilities/gridding/tools/leave1out_analysis_aggregation.py
@@ -0,0 +1,90 @@
+import pandas as pd
+
+
+BASE_PATH = '/BGFS/DISASTER/grids/leave_one_out'
+
+variable_name = 'pr6'
+
+is_test_running=False
+
+# interpolation_algorithms = ['spheremap', 'idw', 'adw', 'cdd']
+interpolation_algorithms = ['cdd']
+
+if not is_test_running:
+    algorithm_folder_name = {
+        'spheremap': 'sph_all',
+        'idw': 'idw_all',
+        'adw': 'adw_all',
+        'cdd': 'cdd_all'
+    }
+else:
+    algorithm_folder_name = {
+        'spheremap': 'spheremap',
+        'idw': 'idw',
+        'adw': 'adw',
+        'cdd': 'cdd'
+    }
+
+timesteps_file = f'{BASE_PATH}/all-time-steps.txt'
+series = pd.read_csv(timesteps_file, delimiter='\t', header=None, squeeze=True)
+timesteps = list(series.T)
+series = None
+
+df_dic = {
+    'mae': None,
+    'mbe': None,
+    'pearson_r': None,
+    'values_sum': None,
+    'pixels_with_values_0 np.ndarray:
+    print(f'scale_factor: {scale_factor} offset: {offset} no_data: {no_data}')
+    compressed_data = data.astype(float)
+    compressed_data = np.where(compressed_data == no_data, np.nan, compressed_data)
+    decompressed_data = compressed_data * scale_factor + offset
+    decompressed_data = np.round(decompressed_data, 1)
+    unpacked_no_data = no_data * scale_factor + offset
+    decompressed_data = np.where(decompressed_data == unpacked_no_data, np.nan, decompressed_data)
+    return decompressed_data
+
+
+def format_metadata(metadata: dict, value: str, key: str, default: float) -> float:
+    if value is None:
+        return float(metadata.get(key, default))
+    return float(value)
+
+
+def get_unpacking_metadata(file_interpolated_values: Path) -> tuple[float, float, float]:
+    ds=gdal.Open(file_interpolated_values.as_posix())
+    gdal_nan = ds.GetRasterBand(1).GetNoDataValue()
+    gdal_scale = ds.GetRasterBand(1).GetScale()
+    gdal_offset = ds.GetRasterBand(1).GetOffset()
+    print(f'gdal_nan: {gdal_nan} gdal_scale: {gdal_scale} gdal_offset: {gdal_offset}')
+    metadata=ds.GetMetadata()
+    print('metadata: ', metadata)
+    scale_factor = format_metadata(metadata, gdal_scale, 'SCALE', 1.0)
+    offset = format_metadata(metadata, gdal_offset, 'OFFSET', 0.0)
+    no_data = format_metadata(metadata, gdal_nan, 'NODATA', -9999.0)
+    ds = None
+    return scale_factor, offset, no_data
+
+
+def get_interpolated_values(file_interpolated_values: Path, is_compressed_data: bool) -> np.ndarray:
+    # values_ds = rxr.open_rasterio(file_interpolated_values, mask_and_scale=True)
+    scale_factor, offset, no_data = get_unpacking_metadata(file_interpolated_values)
+    values_ds = rxr.open_rasterio(file_interpolated_values)
+    values_ds = values_ds.sel(band=1)
+    values_ds = values_ds.rename({'band': 'values'})
+    values_ds = values_ds.sortby('y', ascending=False)
+    values = unpack_data(values_ds.values, scale_factor, offset, no_data)
+    return values
+
+
+def get_pixelarea_catchments_and_interpolated_values(pixelarea_file_path: Path, catchments_file_path: Path,
+                                                     file_interpolated_values: Path, is_compressed_data: bool) -> xr.Dataset:
+    ds_pixelarea = xr.open_dataset(pixelarea_file_path)
+    ds_pixelarea = ds_pixelarea.sortby('lat', ascending=False)
+    ds_pixelarea = ds_pixelarea.rename({"Band1": "pixarea"})
+    pixelarea = ds_pixelarea.pixarea
+    pixelarea_values = pixelarea.values
+
+    ds_catchments = xr.open_dataset(catchments_file_path)
+    ds_catchments = ds_catchments.sortby('lat', ascending=False)
+    ds_catchments = ds_catchments.rename({"Band1": "catchments"})
+    catchments = ds_catchments.catchments
+    catchments_values = catchments.values
+
+    values = get_interpolated_values(file_interpolated_values, is_compressed_data)
+
+    combined_data = xr.Dataset({'pixarea': (['lat', 'lon'], pixelarea_values),
+                                'catchments': (['lat', 'lon'], catchments_values),
+                                'values': (['lat', 'lon'], values)},
+                               coords={'lon': ds_pixelarea['lon'], 'lat': ds_pixelarea['lat']})
+    combined_data = combined_data.where(combined_data.catchments != 0)
+    return combined_data
+
+
+def write_results(outfile: Path, df: pd.DataFrame):
+    if outfile.is_file():
+        df_from_file = pd.read_csv(outfile, delimiter='\t', index_col='catchments')
+        df = df.drop('pixarea', axis=1)
+        df = pd.merge(df_from_file, df, on=['catchments'])
+    df.to_csv(outfile, sep='\t', index=True)
+
+
+def run(file_catchments: str, file_interpolated_values: str, file_pixel_area: str, outfile: str, is_compressed_data: bool):
+    file_catchments_path = Path(file_catchments)
+    file_interpolated_values_path = Path(file_interpolated_values)
+    file_pixel_area_path = Path(file_pixel_area)
+    outfile_path = Path(outfile)
+    ds = get_pixelarea_catchments_and_interpolated_values(file_pixel_area_path, file_catchments_path,
+                                                          file_interpolated_values_path, is_compressed_data)
+
+    # Convert the precipitation values from mm to m
+    ds['values'] /= 1000.0
+    # Convert precipitation from m to m3
+    ds['values'] = ds['values'] * ds['pixarea']
+
+    values_total_group = ds.groupby('catchments').sum()
+
+    values_total_group_df = values_total_group.to_dataframe()
+#     values_total_group_df = values_total_group_df.dropna()
+
+    test_code = file_interpolated_values_path.parent.name
+    # test_code = 'ALL'
+    values_total_group_df = values_total_group_df.rename(columns={'values': f'{test_code}'})
+    write_results(outfile_path, values_total_group_df)
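The catchment aggregation hinges on `xr.Dataset.groupby('catchments').sum()`: each pixel's precipitation depth is turned into a volume via the pixel area and summed per catchment id. The same pattern on a 2x2 toy grid, all numbers illustrative:

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'pixarea': (['lat', 'lon'], np.full((2, 2), 1.0e6)),                # each pixel = 1 km^2 in m^2
     'catchments': (['lat', 'lon'], np.array([[1, 1], [2, 0]])),         # 0 = outside any catchment
     'values': (['lat', 'lon'], np.array([[2.0, 4.0], [10.0, 99.0]]))},  # precipitation in mm
    coords={'lat': [50.0, 49.0], 'lon': [4.0, 5.0]})

ds = ds.where(ds.catchments != 0)                       # mask pixels outside the catchment map
ds['values'] = ds['values'] / 1000.0 * ds['pixarea']    # mm -> m -> m^3 per pixel
print(ds.groupby('catchments').sum()['values'].values)
# catchment 1: (2+4)/1000*1e6 = 6000 m^3; catchment 2: 10000 m^3 -> [ 6000. 10000.]
```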
+
+
+def main(argv):
+    '''Command line options.'''
+    global quiet_mode
+
+    program_name = os.path.basename(sys.argv[0])
+    program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+    program_version = "v%s" % __version__
+    program_build_date = "%s" % __updated__
+
+    program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)
+    program_longdesc = '''
+    This script uses RepeatedStratifiedKFold to generate several input files as training datasets where a small percentage
+    of evenly distributed stations is removed to constitute the test cases. Later the training datasets will be interpolated
+    with several algorithms: Spheremap, Inverse distance and Angular Distance Weighting. Then the error at the coordinates of
+    the test cases is calculated as the difference between the interpolated value and the real value in the test case.
+    '''
+    program_license = """
+    Copyright 2019-2020 European Union
+    Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+    You may not use this work except in compliance with the Licence.
+    You may obtain a copy of the Licence at:
+    https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+    Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the Licence for the specific language governing permissions and limitations under the Licence.
+    """
+
+#    try:
+    # setup option parser
+    parser = ArgumentParser(epilog=program_license, description=program_version_string+program_longdesc)
+
+    # set defaults
+    parser.set_defaults(is_compressed_data=False)
+
+    parser.add_argument("-t", "--catchments", dest="file_catchments", required=True, type=FileUtils.file_type,
+                        help="Set file path for catchments netCDF.",
+                        metavar="txt_file")
+    parser.add_argument("-i", "--interpolated", dest="file_interpolated_values", required=True, type=FileUtils.file_type,
+                        help="Set file path for interpolated values in tiff format.",
+                        metavar="tiff_file")
+    parser.add_argument("-p", "--pixelarea", dest="file_pixel_area", required=True, type=FileUtils.file_type,
+                        help="Set file path for pixel area netCDF.",
+                        metavar="pixel_area_file")
+    parser.add_argument("-o", "--out", dest="outfile", required=True, type=FileUtils.file_or_folder,
+                        help="Set output file containing the statistics.",
+                        metavar="output_file")
+    parser.add_argument("-c", "--compressed", dest="is_compressed_data", action="store_true",
+                        help="Indicates if data in the tiff file is compressed and decompresses it before using. [default: %(default)s]")
diff --git a/src/lisfloodutilities/gridding/tools/leave1out_analysis_by_catchment_aggregation.py b/src/lisfloodutilities/gridding/tools/leave1out_analysis_by_catchment_aggregation.py
new file mode 100644
index 0000000..8a79a7f
--- /dev/null
+++ b/src/lisfloodutilities/gridding/tools/leave1out_analysis_by_catchment_aggregation.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+variable_name = 'pr'
+
+BASE_PATH = '/BGFS/DISASTER/grids/leave_one_out'
+
+timesteps = ['199106170600', '199106180600']
+# timesteps = ['199106180600']
+interpolation_algorithms = ['spheremap', 'idw', 'adw', 'cdd']
+
+for timestep in timesteps:
+    for interpolation_algorithm in interpolation_algorithms:
+        file_in = f'{BASE_PATH}/{interpolation_algorithm}/{variable_name}/{variable_name}{timestep}_{interpolation_algorithm}_BY_CATCHMENT.stats'
+        file_out = f'{BASE_PATH}/{interpolation_algorithm}/{variable_name}/{variable_name}{timestep}_{interpolation_algorithm}_BY_CATCHMENT_out.stats'
+        df = pd.read_csv(file_in, delimiter='\t')
+        df = df.set_index('catchments')
+        # Slice the test columns once: computing the std from df.loc[:, 't01':]
+        # after inserting the 'mean' column would wrongly include 'mean' itself.
+        test_cols = df.loc[:, 't01':]
+        df['mean'] = test_cols.mean(axis=1)
+        df['std'] = test_cols.std(axis=1)
+        df.to_csv(file_out, sep='\t', index=True)
+
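
A quick check of the row-wise statistics with made-up numbers (note that pandas std() uses the sample estimator, ddof=1):

import pandas as pd

df = pd.DataFrame({'catchments': [1, 2],
                   't01': [6000.0, 5000.0],
                   't02': [5900.0, 5200.0]}).set_index('catchments')
test_cols = df.loc[:, 't01':]
df['mean'] = test_cols.mean(axis=1)
df['std'] = test_cols.std(axis=1)
print(df['std'].values)  # [ 70.71067812 141.42135624]
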
diff --git a/src/lisfloodutilities/gridding/tools/leave1out_analysis_error_distribution.py b/src/lisfloodutilities/gridding/tools/leave1out_analysis_error_distribution.py
new file mode 100644
index 0000000..85857d1
--- /dev/null
+++ b/src/lisfloodutilities/gridding/tools/leave1out_analysis_error_distribution.py
@@ -0,0 +1,145 @@
+__author__ = "Goncalo Gomes"
+__date__ = "$Jul 12, 2023 12:01:00$"
+__version__ = "0.1"
+__updated__ = "$Jul 13, 2023 10:41:00$"
+
+"""
+Copyright 2019-2020 European Union
+Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+You may not use this work except in compliance with the Licence.
+You may obtain a copy of the Licence at:
+https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the Licence for the specific language governing permissions and limitations under the Licence.
+
+"""
+
+import sys
+import os
+import csv
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentTypeError
+from lisfloodutilities.gridding.lib.utils import FileUtils
+import pandas as pd
+import numpy as np
+import rioxarray as rxr
+from osgeo import gdal
+import netCDF4 as nc
+from scipy.spatial import cKDTree
+from scipy.stats import pearsonr
+from sklearn.model_selection import RepeatedStratifiedKFold
+from sklearn.metrics import mean_absolute_error
+from sklearn.feature_selection import r_regression
+
+
+def ratio(y_true: np.array, y_pred: np.array) -> np.array:
+    '''
+    Parameters:
+        y_true (array): Array of observed values
+        y_pred (array): Array of predicted values
+
+    Returns:
+        Array of absolute relative errors |1 - y_pred / y_true| between predicted and observed values
+    '''
+    return np.abs(1 - y_pred / y_true)
+
+
+def write_results(outfile: Path, input_df: pd.DataFrame):
+    if outfile.is_file():
+        output_df = pd.read_csv(outfile, delimiter='\t')
+        df = pd.merge(output_df, input_df, on=['x', 'y'], suffixes=('_1', '_2'), how='outer')
+        df['ratio'] = df['ratio_1'].fillna(0) + df['ratio_2'].fillna(0)
+        df.drop(['ratio_1', 'ratio_2'], axis=1, inplace=True)
+    else:
+        df = pd.DataFrame({
+            'x': input_df['x'],
+            'y': input_df['y'],
+            'ratio': input_df['ratio'],
+        })
+    df.to_csv(outfile, sep='\t', index=False)
+
+
+def run(file_true_values: str, outfile: str):
+    outfile_path = Path(outfile)
+    file_true_values_path = Path(file_true_values)
+    # Load the CSV file into a pandas dataframe
+    df_true_and_predicted_values = pd.read_csv(file_true_values_path.with_suffix('.tab'), delimiter='\t')
+    # Remove all duplicates by x, y since we do not know which one to keep and they cause issues when writing the results
+    df_true_and_predicted_values = df_true_and_predicted_values.drop_duplicates(['x', 'y'], keep=False)
+
+    limit_value = 0.0
+    df_true_and_predicted_values = df_true_and_predicted_values[(df_true_and_predicted_values['true_value'] > limit_value) & (df_true_and_predicted_values['interpolated_value'] > limit_value)]
+
+    true_values = df_true_and_predicted_values['true_value']
+    predicted_values = df_true_and_predicted_values['interpolated_value']
+
+    df_true_and_predicted_values['ratio'] = ratio(true_values, predicted_values)
+    # Leaving only x, y, ratio
+    df_true_and_predicted_values.drop(['true_value', 'interpolated_value'], axis=1, inplace=True)
+    # Write the results to the output file
+    write_results(outfile_path, df_true_and_predicted_values)
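
A worked example of the metric computed by ratio() (numbers illustrative): a prediction of 8 against an observation of 10, and a prediction of 5 against an observation of 4, deviate by 20 and 25 percent respectively:

import numpy as np

y_true = np.array([10.0, 4.0])
y_pred = np.array([8.0, 5.0])
print(np.abs(1 - y_pred / y_true))  # [0.2  0.25]
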
+
+
+def main(argv):
+    '''Command line options.'''
+    global quiet_mode
+
+    program_name = os.path.basename(sys.argv[0])
+    program_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+    program_version = "v%s" % __version__
+    program_build_date = "%s" % __updated__
+
+    program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)
+    program_longdesc = '''
+    This script uses RepeatedStratifiedKFold to generate several input files as training dataset, where a small percentage
+    of evenly distributed stations is removed to constitute the test cases. The training dataset is then interpolated
+    with several algorithms: Spheremap, Inverse Distance and Angular Distance Weighting. The error at the coordinates of
+    each test case is computed as the absolute relative deviation between the interpolated value and the true value.
+    '''
+    program_license = """
+    Copyright 2019-2020 European Union
+    Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
+    You may not use this work except in compliance with the Licence.
+    You may obtain a copy of the Licence at:
+    https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
+    Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the Licence for the specific language governing permissions and limitations under the Licence.
+    """
+
+    try:
+        # setup option parser
+        parser = ArgumentParser(epilog=program_license, description=program_version_string+program_longdesc)
+
+        # set defaults
+        parser.set_defaults(is_compressed_data=False)
+
+        parser.add_argument("-t", "--test", dest="file_true_values", required=True, type=FileUtils.file_type,
+                            help="Set file path for true values from the test dataset in txt format.",
+                            metavar="txt_file")
+        parser.add_argument("-o", "--out", dest="outfile", required=True, type=FileUtils.file_or_folder,
+                            help="Set output file containing the statistics.",
+                            metavar="output_file")
+
+        # process options
+        args = parser.parse_args(argv)
+
+        print(f"True values: {args.file_true_values}")
+        print(f"Output File: {args.outfile}")
+
+        run(args.file_true_values, args.outfile)
+        print("Finished.")
+    except Exception as e:
+        indent = len(program_name) * " "
+        sys.stderr.write(program_name + ": " + repr(e) + "\n")
+        sys.stderr.write(indent + "  for help use --help")
+        return 2
+
+
+def main_script():
+    sys.exit(main(sys.argv[1:]))
+
+
+if __name__ == "__main__":
+    main_script()
\ No newline at end of file
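
A plausible invocation of this tool, with illustrative paths (-t and -o are the only options defined above):

python src/lisfloodutilities/gridding/tools/leave1out_analysis_error_distribution.py \
    -t /BGFS/DISASTER/grids/leave_one_out/adw/pr/t01/pr199106170600_adw.txt \
    -o /BGFS/DISASTER/grids/leave_one_out/adw/pr/pr199106170600_adw_ratio.stats

Note that run() replaces the suffix of the -t path with .tab, so a tab separated file with x, y, true_value and interpolated_value columns must exist alongside the given path.
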
diff --git a/tests/data/gridding/meteo_in/test2/pr6.nc b/tests/data/gridding/meteo_in/test2/pr6.nc
index c10f281..e4ccdbd 100644
Binary files a/tests/data/gridding/meteo_in/test2/pr6.nc and b/tests/data/gridding/meteo_in/test2/pr6.nc differ
diff --git a/tests/data/gridding/reference/test1/pr6.nc b/tests/data/gridding/reference/test1/pr6.nc
index c10f281..e4ccdbd 100644
Binary files a/tests/data/gridding/reference/test1/pr6.nc and b/tests/data/gridding/reference/test1/pr6.nc differ
diff --git a/tests/data/gridding/reference/test2/pr6.nc b/tests/data/gridding/reference/test2/pr6.nc
index 41ddfe8..a3fd146 100644
Binary files a/tests/data/gridding/reference/test2/pr6.nc and b/tests/data/gridding/reference/test2/pr6.nc differ
diff --git a/tests/data/gridding/reference/test3/2023/03/13/pr6202303131200_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/13/pr6202303131200_20230113111901.tiff
index 80fb173..ba26636 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/13/pr6202303131200_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/13/pr6202303131200_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/13/pr6202303131800_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/13/pr6202303131800_20230113111901.tiff
index 4007ef2..0ff404d 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/13/pr6202303131800_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/13/pr6202303131800_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/14/pr6202303140000_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/14/pr6202303140000_20230113111901.tiff
index 6b2d370..f7f646e 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/14/pr6202303140000_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/14/pr6202303140000_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/14/pr6202303140600_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/14/pr6202303140600_20230113111901.tiff
index 9fa7a14..1fb7463 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/14/pr6202303140600_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/14/pr6202303140600_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/14/pr6202303141200_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/14/pr6202303141200_20230113111901.tiff
index ae3dc08..d79fc7b 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/14/pr6202303141200_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/14/pr6202303141200_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/14/pr6202303141800_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/14/pr6202303141800_20230113111901.tiff
index aa0734e..4497b63 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/14/pr6202303141800_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/14/pr6202303141800_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/15/pr6202303150000_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/15/pr6202303150000_20230113111901.tiff
index f0945bf..0d6f249 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/15/pr6202303150000_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/15/pr6202303150000_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/15/pr6202303150600_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/15/pr6202303150600_20230113111901.tiff
index fd2f19b..546b603 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/15/pr6202303150600_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/15/pr6202303150600_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/15/pr6202303151200_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/15/pr6202303151200_20230113111901.tiff
index 1369f3e..f439240 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/15/pr6202303151200_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/15/pr6202303151200_20230113111901.tiff differ
diff --git a/tests/data/gridding/reference/test3/2023/03/15/pr6202303151800_20230113111901.tiff b/tests/data/gridding/reference/test3/2023/03/15/pr6202303151800_20230113111901.tiff
index 970fdd6..b0f19a0 100644
Binary files a/tests/data/gridding/reference/test3/2023/03/15/pr6202303151800_20230113111901.tiff and b/tests/data/gridding/reference/test3/2023/03/15/pr6202303151800_20230113111901.tiff differ
diff --git a/tests/test_gridding.py b/tests/test_gridding.py
index b854449..839ebd6 100755
--- a/tests/test_gridding.py
+++ b/tests/test_gridding.py
@@ -5,8 +5,8 @@
 import numpy as np
 from osgeo import gdal
 from netCDF4 import Dataset
-from lisfloodutilities.gridding.generate_grids import run, print_msg
-from lisfloodutilities.gridding.lib.utils import FileUtils
+from src.lisfloodutilities.gridding.generate_grids import run, print_msg
+from src.lisfloodutilities.gridding.lib.utils import FileUtils

 cur_folder = os.path.dirname(os.path.realpath(__file__))

@@ -28,13 +28,13 @@ def compare_tiffs(self, tiff_folder_path1: Path, tiff_folder_path2: Path):
             ds2 = gdal.Open(str(tiff_file2))
             values2 = ds2.GetRasterBand(1).ReadAsArray()
             ds2 = None
-            values1 = np.zeros(values2.shape)
-            print(f'Testing {tiff_file2.name}')
-            for tiff_file1 in tiff_folder_path1.rglob(tiff_file2.name):
+            values1 = None
+            for tiff_file1 in sorted(tiff_folder_path1.rglob(tiff_file2.name)):
                 ds1 = gdal.Open(str(tiff_file1))
                 values1 = ds1.GetRasterBand(1).ReadAsArray()
                 ds1 = None
-            assert values2.size == values2.size, 'Grid {tiff_file2.name} do not have same size as reference.'
+            assert values1 is not None, f'Grid {tiff_file2.name} was not generated.'
+            assert values1.size == values2.size, f'Grid {tiff_file2.name} does not have the same size as the reference.'
             assert np.allclose(values1, values2, atol=0.0000001), f'File {tiff_file2.name} is not equal to reference.'

     def test_generate_netcdf(self):
@@ -92,6 +92,7 @@ def test_update_netcdf(self):
         # This is needed so we can test updating 1 timestep on an existing netCDF file.
         copy2(os.path.join(input_folder, 'pr6.nc'), output_netcdf)

+        use_broadcasting = False
         quiet_mode = True
         variable_code = 'pr6'
         config_type = '1arcmin'
@@ -114,7 +115,7 @@ def test_update_netcdf(self):
         end_date = datetime.strptime(end_date_str, FileUtils.DATE_PATTERN_CONDENSED)

         run(config_filename, infolder, outfolder_or_file, processing_dates_file,
-            file_utils, out_tiff, overwrite_output, start_date, end_date)
+            file_utils, out_tiff, overwrite_output, start_date, end_date, use_broadcasting=use_broadcasting)

         reference = Dataset(reference_output)
         out = Dataset(output_netcdf)
@@ -128,14 +129,16 @@ def test_generate_tiff(self):
         input_folder = 'tests/data/gridding/meteo_in/test1'
         reference_output = 'tests/data/gridding/reference/test3'
         output_folder = 'tests/data/gridding/meteo_out/test3'
+
+        output_folder_path = Path(output_folder)

-        Path(output_folder).mkdir(parents=True, exist_ok=True)
-
-        for filename in os.listdir(output_folder):
-            file_path = os.path.join(output_folder, filename)
-            if os.path.exists(file_path):
-                rmtree(file_path)
+        output_folder_path.mkdir(parents=True, exist_ok=True)
+
+        # Clean output folder
+        for output_file in sorted(output_folder_path.rglob('*.*')):
+            os.remove(output_file)

+        use_broadcasting = False
         quiet_mode = True
         variable_code = 'pr6'
         config_type = '1arcmin'
@@ -144,7 +147,7 @@ def test_generate_tiff(self):
         out_tiff = True
         infolder = input_folder
         overwrite_output = False
-        outfolder_or_file = output_folder
+        outfolder_or_file = str(Path(output_folder, 'output.nc'))
         processing_dates_file = None
         configuration_base_folder = os.path.join(cur_folder, '../src/lisfloodutilities/gridding/configuration')

@@ -159,10 +162,18 @@ def test_generate_tiff(self):
         end_date = datetime.strptime(end_date_str, FileUtils.DATE_PATTERN_CONDENSED)

         run(config_filename, infolder, outfolder_or_file, processing_dates_file,
-            file_utils, out_tiff, overwrite_output, start_date, end_date)
-
-        print('compare_tiffs')
-        self.compare_tiffs(Path(output_folder), Path(reference_output))
-
+            file_utils, out_tiff, overwrite_output, start_date, end_date, use_broadcasting=use_broadcasting)
+
+        # Remove the netCDF output file since we are only interested in the output tiffs
+        if os.path.exists(outfolder_or_file):
+            os.remove(outfolder_or_file)
+
+        # Move the output tiffs from the input folder into the output folder
+        for tiff_file in sorted(Path(input_folder).rglob('*.tiff')):
+            out_tiff_file = Path(output_folder, tiff_file.name)
+            tiff_file.rename(out_tiff_file)
+
+        self.compare_tiffs(output_folder_path, Path(reference_output))
+
         out = None
         reference = None
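
With the fixtures above updated, the affected tests can be run on their own, for example:

pytest tests/test_gridding.py -k "test_update_netcdf or test_generate_tiff"
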