Commit

Create cross validation and tune interpolation
gnrgomes committed Nov 23, 2023
1 parent 42fedc3 commit f72b6fa
Showing 23 changed files with 1,163 additions and 130 deletions.
95 changes: 50 additions & 45 deletions src/lisfloodutilities/gridding/generate_grids.py
@@ -21,8 +21,8 @@
from pathlib import Path
from argparse import ArgumentParser, ArgumentTypeError
from datetime import datetime, timedelta
from lisfloodutilities.gridding.lib.utils import Printable, Dem, Config, FileUtils, GriddingUtils
from lisfloodutilities.gridding.lib.writers import NetCDFWriter, GDALWriter
from src.lisfloodutilities.gridding.lib.utils import Printable, Dem, Config, FileUtils, GriddingUtils
from src.lisfloodutilities.gridding.lib.writers import NetCDFWriter, GDALWriter


END_DATE_DEFAULT = datetime.now().strftime('%Y%m%d060000')
@@ -35,8 +35,14 @@ def print_msg(msg: str = ''):
if not quiet_mode:
print(msg)

def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_dates_file: str, file_utils: FileUtils,
output_tiff: bool, overwrite_output: bool, start_date: datetime = None, end_date: datetime = None):
def interpolation_mode_type(mode: str) -> str:
if not mode or mode not in Config.INTERPOLATION_MODES:
raise ArgumentTypeError(f'You must select a mode out of {list(Config.INTERPOLATION_MODES.keys())}.')
return mode

def run(config_filename: str, infolder: str, output_file: str, processing_dates_file: str, file_utils: FileUtils,
output_tiff: bool, overwrite_output: bool, start_date: datetime = None, end_date: datetime = None,
interpolation_mode: str = 'adw', use_broadcasting: bool = False):
"""
Interpolate text files containing (x, y, value) using inverse distance interpolation.
Produces as output, either a netCDF file containing all the grids or one TIFF file per grid.
@@ -47,13 +53,13 @@ def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_
"""
global quiet_mode

conf = Config(config_filename, start_date, end_date, quiet_mode)
conf = Config(config_filename, start_date, end_date, quiet_mode, interpolation_mode)

if conf.start_date > conf.end_date:
raise ArgumentTypeError("Start date is greater than End date.")

dates_to_process = file_utils.read_processing_dates_file(processing_dates_file)
grid_utils = GriddingUtils(conf, quiet_mode)
grid_utils = GriddingUtils(conf, quiet_mode, use_broadcasting)

size_lons = len(conf.dem_lons)
size_lats = len(conf.dem_lats)
@@ -69,34 +75,25 @@ def run(config_filename: str, infolder: str, outfolder_or_file: str, processing_

netcdf_offset_file_date = int(conf.get_config_field('VAR_TIME','OFFSET_FILE_DATE'))

outfile = output_file
if output_tiff:
outfolder = outfolder_or_file
output_writer = GDALWriter(conf, overwrite_output, quiet_mode)
for filename in sorted(Path(infolder).rglob(inwildcard)):
file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
continue # Skip processing file
print_msg(f'Processing file: {filename}')
outfile = str(filename).replace(infolder, outfolder)
outfilepath = Path(outfile).with_suffix('.tiff')
# Create the output parent folders if not exist yet
Path(outfilepath.parent).mkdir(parents=True, exist_ok=True)
output_writer.open(Path(outfilepath))
grid_data = grid_utils.generate_grid(filename)
output_writer.write(grid_data, file_timestamp)
output_writer.close()
else: # NetCDF
outfile = outfolder_or_file
output_writer = NetCDFWriter(conf, overwrite_output, quiet_mode)
output_writer.open(Path(outfile))
for filename in sorted(Path(infolder).rglob(inwildcard)):
file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
continue # Skip processing file
print_msg(f'Processing file: {filename}')
grid_data = grid_utils.generate_grid(filename)
output_writer.write(grid_data, file_timestamp)
output_writer.close()
output_writer_tiff = GDALWriter(conf, overwrite_output, quiet_mode)
output_writer_netcdf = NetCDFWriter(conf, overwrite_output, quiet_mode)
output_writer_netcdf.open(Path(outfile))
for filename in sorted(Path(infolder).rglob(inwildcard)):
file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
if not file_utils.processable_file(file_timestamp, dates_to_process, conf.start_date, conf.end_date):
continue # Skip processing file
print_msg(f'Processing file: {filename}')
if output_tiff:
outfilepath = filename.with_suffix('.tiff')
output_writer_tiff.open(outfilepath)
grid_data = grid_utils.generate_grid(filename)
output_writer_netcdf.write(grid_data, file_timestamp)
if output_tiff:
output_writer_tiff.write(grid_data, file_timestamp)
output_writer_tiff.close()
output_writer_netcdf.close()
print_msg('Finished writing files')


@@ -111,7 +108,7 @@ def main(argv):

program_version_string = 'version %s (%s)\n' % (program_version, program_build_date)
program_longdesc = '''
This script interpolates meteo input variables data into either a single NETCDF4 file or one GEOTIFF file per timestep.
This script interpolates meteo input variable data into a single NETCDF4 file and, if selected, also generates one GEOTIFF file per timestep.
The resulting netCDF is CF-1.6 compliant.
'''
program_license = """
@@ -134,14 +131,16 @@ def main(argv):
out_tiff=False,
overwrite_output=False,
start_date='',
end_date=END_DATE_DEFAULT)
end_date=END_DATE_DEFAULT,
interpolation_mode='adw',
use_broadcasting=False)

parser.add_argument("-i", "--in", dest="infolder", required=True, type=FileUtils.folder_type,
help="Set input folder path with kiwis/point files",
metavar="input_folder")
parser.add_argument("-o", "--out", dest="outfolder_or_file", required=True, type=FileUtils.file_or_folder,
help="Set output folder base path for the tiff files or the netCDF file path.",
metavar="{output_folder, netcdf_file}")
parser.add_argument("-o", "--out", dest="output_file", required=True, type=FileUtils.file_or_folder,
help="Set the output netCDF file path containing all the timesteps between start and end dates.",
metavar="output_netcdf_file")
parser.add_argument("-c", "--conf", dest="config_type", required=True,
help="Set the grid configuration type to use.",
metavar="{5x5km, 1arcmin,...}")
@@ -159,9 +158,14 @@ def main(argv):
metavar="YYYYMMDDHHMISS")
parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", help="Set script output into quiet mode [default: %(default)s]")
parser.add_argument("-t", "--tiff", dest="out_tiff", action="store_true",
help="Outputs a tiff file per timestep instead of the default single netCDF [default: %(default)s]")
help="Outputs a tiff file per timestep and also the single netCDF with all the timesteps [default: %(default)s]")
parser.add_argument("-f", "--force", dest="overwrite_output", action="store_true",
help="Force write to existing file. TIFF files will be overwritten and netCDF file will be appended. [default: %(default)s]")
parser.add_argument("-m", "--mode", dest="interpolation_mode", required=False, type=interpolation_mode_type,
help="Set interpolation mode. [default: %(default)s]",
metavar=f"{list(Config.INTERPOLATION_MODES.keys())}")
parser.add_argument("-b", "--broadcast", dest="use_broadcasting", action="store_true",
help="When set, computations will run faster in full broadcasting mode but require more memory. [default: %(default)s]")

# process options
args = parser.parse_args(argv)
@@ -194,17 +198,18 @@ def main(argv):

print_msg(f"Input Folder: {args.infolder}")
print_msg(f"Overwrite Output: {args.overwrite_output}")
print_msg(f"Interpolation Mode: {args.interpolation_mode}")
print_msg(f"Broadcasting: {args.use_broadcasting}")
if args.out_tiff:
print_msg("Output Type: TIFF")
print_msg(f"Output Folder: {args.outfolder_or_file}")
else:
print_msg("Output Type: netCDF")
print_msg(f"Output File: {args.outfolder_or_file}")
print_msg(f"Output Folder: {args.infolder}")
print_msg("Output Type: netCDF")
print_msg(f"Output File: {args.output_file}")
print_msg(f"Processing Dates File: {args.processing_dates_file}")
print_msg(f"Config File: {config_filename}")

run(config_filename, args.infolder, args.outfolder_or_file, args.processing_dates_file,
file_utils, args.out_tiff, args.overwrite_output, start_date, end_date)
run(config_filename, args.infolder, args.output_file, args.processing_dates_file,
file_utils, args.out_tiff, args.overwrite_output, start_date, end_date, args.interpolation_mode, args.use_broadcasting)
except Exception as e:
indent = len(program_name) * " "
sys.stderr.write(program_name + ": " + repr(e) + "\n")
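Note: the new -m/--mode and -b/--broadcast options added above can be exercised as in the minimal sketch below. This is an illustrative call only: the module path, input folder, output file, grid configuration and variable are assumptions, and required options that are not visible in this diff (for example the processing-dates file or the start/end dates) are omitted.

# Minimal usage sketch for the updated CLI -- paths and configuration values are placeholders.
from lisfloodutilities.gridding.generate_grids import main  # module path assumed

main([
    '-i', '/data/kiwis/pr6',      # folder with kiwis/point files
    '-o', '/data/grids/pr6.nc',   # single netCDF holding all timesteps
    '-c', '1arcmin',              # grid configuration type
    '-m', 'adw',                  # one of Config.INTERPOLATION_MODES
    '-b',                         # full broadcasting mode: faster, needs more memory
    '-t',                         # additionally write one TIFF per timestep
    '-f',                         # overwrite TIFFs / append to an existing netCDF
])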
104 changes: 76 additions & 28 deletions src/lisfloodutilities/gridding/lib/utils.py
@@ -47,8 +47,8 @@ def print_msg(self, msg: str = ''):


class Dem(Printable):
def __init__(self, dem_map: Path):
super().__init__()
def __init__(self, dem_map: Path, quiet_mode: bool = False):
super().__init__(quiet_mode)
self._dem_map = dem_map
self.print_msg(f'Reading grid settings and altitude values from: {dem_map}')
reader = NetCDFReader(self._dem_map)
@@ -150,14 +150,18 @@ def file_or_folder(fname: str) -> str:


class Config(Printable):
# Each interpolation is paired with the number of neighbours used by the interpolation algorithm.
INTERPOLATION_MODES = {'nearest': 1, 'invdist': 20, 'adw': 11, 'cdd': 11,
'bilinear': 4, 'triangulation': 3, 'bilinear_delaunay': 4}
def __init__(self, config_filename: str, start_date: datetime = None,
end_date: datetime = None, quiet_mode: bool = False):
end_date: datetime = None, quiet_mode: bool = False, interpolation_mode: str = 'adw'):
super().__init__(quiet_mode)
self.COLUMN_HEIGHT = "height"
self.COLUMN_VALUE = "value"
self.COLUMN_LON = "lon"
self.COLUMN_LAT = "lat"
self.VALUE_NAN = -9999.0
self.interpolation_mode = interpolation_mode
self.__setup_variable_config(config_filename)
self.__HEIGHT_CORRECTION_FACTOR = {"tx": 0.006, "tn": 0.006, "ta": 0.006, "ta6": 0.006, "pd": 0.00025}

@@ -167,18 +171,28 @@ def __init__(self, config_filename: str, start_date: datetime = None,
self._dem_height_correction_kdt_file = Path(self.config_path).joinpath('dem.kdt')

self.__setup_dem()

# self.interpolation_mode = 'nearest'
self.interpolation_mode = 'invdist'
# self.interpolation_mode = 'bilinear'
# self.interpolation_mode = 'triangulation'
# self.interpolation_mode = 'bilinear_delaunay'
self.scipy_modes_nnear = {'nearest': 1, 'invdist': 4, 'bilinear': 4, 'triangulation': 3, 'bilinear_delaunay': 4}
self.grid_details = {'gridType': 'NOT rotated', 'Nj': 1, 'radius': 1.0}

self.__setup_interpolation_parameters()
self.__setup_data_structures()
self.__setup_dates(start_date, end_date)

def __setup_interpolation_parameters(self):
self.scipy_modes_nnear = Config.INTERPOLATION_MODES
self.grid_details = {'gridType': 'NOT rotated', 'Nj': 1, 'radius': 6367470.0}
self.min_upper_bound = 100000000 # max search distance in meters
cdd_map_path = Path(self.config_path).joinpath(f'CDDmap_{self.var_code}.nc')
self.cdd_map = f'{cdd_map_path}'
self.cdd_mode = 'MixHofstraShepard'
# 1) m=4; r=1/3; min 4 stations
# 2) m=8, r=1/3; min 4 stations
# 3) m=8; r=2/3; min 4 stations
# 4) m=4; r=1/3; min 4 stations + CDD map without Euro4m, CarpatClim
self.cdd_options = {
'm_const': 4,
'min_num_of_station': 4,
'radius_ratio': 1/3,
'weights_mode': 'All' # 'OnlyTOP10'
}

def __setup_dates(self, start_date: datetime = None, end_date: datetime = None):
netcdf_var_time_unit_pattern = self.get_config_field('VAR_TIME','UNIT_PATTERN')
if start_date is None:
@@ -202,7 +216,7 @@ def __setup_variable_config(self, config_filename: str):
self.__configFile.read(config_filename)

def __setup_dem(self):
self.dem = Dem(self._dem_file)
self.dem = Dem(self._dem_file, self.quiet_mode)
self.dem_nrows = self.dem.nrows
self.dem_ncols = self.dem.ncols
self.dem_lons = self.dem.lons.flatten()
@@ -263,39 +277,62 @@ def __setup_data_structures(self):
self.DEM_HEIGHT_CORRECTION_QUERY = pickle.load(self._dem_height_correction_kdt_file.open("rb"))
self.print_msg('Finish loading data structures')

def get_config_field(self, config_group: str = '', config_property: str = ''):
def get_config_field(self, config_group: str = '', config_property: str = '') -> str:
return self.__configFile.get(config_group, config_property)

@property
def scale_factor(self):
def scale_factor(self) -> float:
return float(self.get_config_field('PROPERTIES', 'VALUE_SCALE'))

@property
def add_offset(self):
def add_offset(self) -> float:
return float(self.get_config_field('PROPERTIES', 'VALUE_OFFSET'))

@property
def var_code(self):
def value_min(self) -> int:
return int(self.get_config_field('PROPERTIES', 'VALUE_MIN'))

@property
def value_max(self) -> int:
return int(self.get_config_field('PROPERTIES', 'VALUE_MAX'))

@property
def value_min_packed(self) -> int:
return int((self.value_min - self.add_offset) / self.scale_factor)

@property
def value_max_packed(self) -> int:
return int((self.value_max - self.add_offset) / self.scale_factor)

@property
def value_nan_packed(self) -> float:
return float((self.VALUE_NAN - self.add_offset) / self.scale_factor)

@property
def var_code(self) -> str:
return self.get_config_field('PROPERTIES', 'VAR_CODE')

@property
def do_height_correction(self):
def do_height_correction(self) -> bool:
return self.var_code in self.__HEIGHT_CORRECTION_FACTOR

@property
def height_correction_factor(self):
def height_correction_factor(self) -> float:
return self.__HEIGHT_CORRECTION_FACTOR[self.var_code]

@property
def neighbours_near(self):
def neighbours_near(self) -> int:
return self.scipy_modes_nnear[self.interpolation_mode]


class GriddingUtils(Printable):

def __init__(self, conf: Config, quiet_mode: bool = False):
def __init__(self, conf: Config, quiet_mode: bool = False, use_broadcasting: bool = False):
super().__init__(quiet_mode)
self.conf = conf
self.use_broadcasting = use_broadcasting
self.unit_conversion = float(self.conf.get_config_field('PROPERTIES', 'UNIT_CONVERSION'))
# self.compressed_NaN = (self.conf.VALUE_NAN - self.conf.add_offset) / self.conf.scale_factor

def correct_height(self, df: pd.DataFrame) -> pd.DataFrame:
if self.conf.do_height_correction:
@@ -325,8 +362,14 @@ def reset_height(self, result: ma.MaskedArray) -> np.ndarray:

def prepare_grid(self, result: np.ndarray, grid_shape: np.ndarray) -> np.ndarray:
# Compress data
result[np.where(result!=self.conf.VALUE_NAN)] /= self.conf.scale_factor
result[np.where(result!=self.conf.VALUE_NAN)] += self.conf.add_offset
if self.unit_conversion != 1.0:
result = result * self.unit_conversion
result[np.where(result == self.conf.VALUE_NAN)] = np.nan
result[np.where(result < self.conf.value_min)] = np.nan
result[np.where(result > self.conf.value_max)] = np.nan
result = np.round(result.astype(float), 1)
result = (result - self.conf.add_offset) / self.conf.scale_factor
result[np.isnan(result)] = self.conf.VALUE_NAN
# Reshape grid
grid_data = result.reshape(grid_shape)
return grid_data
@@ -335,12 +378,10 @@ def check_grid_nan(self, filename: Path, result: np.ndarray):
# Verify if grid contains NaN different than the ones on the DEM.
# Which means there are NaN values that shouldn't be generated.
count_dem_nan = self.conf.dem.count_nan
value_min = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MIN'))
value_max = int(self.conf.get_config_field('PROPERTIES', 'VALUE_MAX'))
current_grid = result.copy()
current_grid[np.where(current_grid < value_min)] = np.nan
current_grid[np.where(current_grid > value_max)] = np.nan
current_grid[np.where(current_grid == self.conf.VALUE_NAN)] = np.nan
current_grid[np.where(current_grid < self.conf.value_min)] = np.nan
current_grid[np.where(current_grid > self.conf.value_max)] = np.nan
count_grid_nan = np.count_nonzero(np.isnan(current_grid))
if count_dem_nan != count_grid_nan:
print(f"WARNING: The grid interpolated from file {filename.name} contains different NaN values ({count_grid_nan}) than the DEM ({count_dem_nan}). diff: {count_grid_nan - count_dem_nan}")
@@ -363,7 +404,14 @@ def generate_grid(self, filename: Path) -> np.ndarray:
scipy_interpolation = ScipyInterpolation(xp, yp, self.conf.grid_details, values,
self.conf.neighbours_near, mv_target, mv_source,
target_is_rotated=False, parallel=False,
mode=self.conf.interpolation_mode)
mode=self.conf.interpolation_mode,
cdd_map=self.conf.cdd_map,
cdd_mode=self.conf.cdd_mode,
cdd_options=self.conf.cdd_options,
use_broadcasting=self.use_broadcasting)

# reset search radius to a fixed value
scipy_interpolation.min_upper_bound = self.conf.min_upper_bound
results, weights, indexes = scipy_interpolation.interpolate(grid_x, grid_y)
result = np.ma.masked_array(data=results, mask=self.conf.dem_mask, fill_value=self.conf.VALUE_NAN)
self.print_msg('Finish interpolation')
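The value-packing changes above (the reworked prepare_grid plus the new value_min_packed / value_max_packed / value_nan_packed properties) all rely on the same relation: packed = (value - add_offset) / scale_factor, with cells outside [value_min, value_max] stored as VALUE_NAN. A simplified, self-contained sketch of that arithmetic follows; the scale, offset and range values are assumptions (the real ones come from the PROPERTIES section of the variable configuration), and unit conversion and the handling of incoming no-data values are skipped.

import numpy as np

# Assumed packing settings -- stand-ins for PROPERTIES/VALUE_SCALE, VALUE_OFFSET, VALUE_MIN, VALUE_MAX.
scale_factor, add_offset = 0.1, 0.0
value_min, value_max, value_nan = -50.0, 60.0, -9999.0

def pack(values: np.ndarray) -> np.ndarray:
    """Clamp out-of-range cells, round to one decimal, then pack like prepare_grid."""
    result = values.astype(float)
    result[(result < value_min) | (result > value_max)] = np.nan  # invalid cells become NaN
    result = np.round(result, 1)
    result = (result - add_offset) / scale_factor                 # physical value -> packed value
    result[np.isnan(result)] = value_nan                          # NaN stored as the fill value
    return result

def unpack(packed: np.ndarray) -> np.ndarray:
    """Inverse relation: packed value -> physical value."""
    return packed * scale_factor + add_offset

raw = np.array([21.3, -60.0, 75.0])    # one valid cell, two out-of-range cells
packed = pack(raw)                     # approximately [213., -9999., -9999.]
restored = unpack(packed[:1])          # approximately [21.3]

# Packed bounds, mirroring the new value_min_packed / value_max_packed properties:
min_packed = int((value_min - add_offset) / scale_factor)   # -500
max_packed = int((value_max - add_offset) / scale_factor)   #  600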