Merge pull request #53 from ec-jrc/module_kiwis_interpreter
Add option to use existing file. Correct tests
gnrgomes authored Jan 25, 2024
2 parents 3e564e8 + a8c0c69 commit e02c81c
Showing 27 changed files with 50 additions and 557 deletions.
@@ -1,3 +1,8 @@
[GENERIC]

INPUT_WILDCARD = ??????????.kiwis
INPUT_TIMESTAMP_PATTERN = %%Y%%m%%d%%H.kiwis

[PROPERTIES]

VAR_CODE = tx
@@ -12,6 +17,9 @@ DATA_TYPE_PACKED = i2
STANDARD_NAME = air_temperature
LONG_NAME = Daily Maximum Temperature

# KIWIS_FILTER_COLUMNS = {'COL_LAT': 'station_local_y', 'COL_LON': 'station_local_x', 'COL_IS_IN_DOMAIN': 'EFAS-ADDATTR-ISINARCMINDOMAIN'}
KIWIS_FILTER_COLUMNS = {'COL_LAT': 'lat', 'COL_LON': 'lon', 'COL_IS_IN_DOMAIN': 'EFAS-ADDATTR-ISINARCMINDOMAIN'}

[VAR_TIME]

UNIT = days since 1990-01-01 18:00:00.0
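The two new [GENERIC] keys tell the loader which KIWIS files to pick up and how to decode the timestamp embedded in their names, while KIWIS_FILTER_COLUMNS maps the logical latitude/longitude/domain columns to the column names actually present in the KIWIS files. Below is a minimal sketch of reading these keys with the standard library; the real Config class in lisfloodutilities may parse them differently, and the sample filename is hypothetical.

```python
import ast
import configparser
from datetime import datetime

raw = """
[GENERIC]
INPUT_WILDCARD = ??????????.kiwis
INPUT_TIMESTAMP_PATTERN = %%Y%%m%%d%%H.kiwis

[PROPERTIES]
KIWIS_FILTER_COLUMNS = {'COL_LAT': 'lat', 'COL_LON': 'lon', 'COL_IS_IN_DOMAIN': 'EFAS-ADDATTR-ISINARCMINDOMAIN'}
"""

cfg = configparser.ConfigParser()
cfg.read_string(raw)

wildcard = cfg["GENERIC"]["INPUT_WILDCARD"]                    # glob used to find input files
timestamp_pattern = cfg["GENERIC"]["INPUT_TIMESTAMP_PATTERN"]  # '%%' unescapes to '%'
filter_columns = ast.literal_eval(cfg["PROPERTIES"]["KIWIS_FILTER_COLUMNS"])

# Recover the timestamp encoded in a (hypothetical) KIWIS filename.
print(datetime.strptime("2024012506.kiwis", timestamp_pattern))  # 2024-01-25 06:00:00
print(wildcard, filter_columns["COL_LAT"])
```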
15 changes: 10 additions & 5 deletions src/lisfloodutilities/gridding/generate_grids.py
@@ -46,8 +46,9 @@ def memory_save_mode_type(mode: str) -> str:
return mode

def run(config_filename: str, infolder: str, output_file: str, processing_dates_file: str, file_utils: FileUtils,
output_tiff: bool, output_netcdf: bool, overwrite_output: bool, start_date: datetime = None, end_date: datetime = None,
interpolation_mode: str = 'adw', use_broadcasting: bool = False, memory_save_mode: str = None):
output_tiff: bool, output_netcdf: bool, overwrite_output: bool, use_existing_file: bool,
start_date: datetime = None, end_date: datetime = None, interpolation_mode: str = 'adw',
use_broadcasting: bool = False, memory_save_mode: str = None):
"""
Interpolate text files containing (x, y, value) using inverse distance interpolation.
Produces as output either a netCDF file containing all the grids or one TIFF file per grid.
@@ -86,7 +87,7 @@ def run(config_filename: str, infolder: str, output_file: str, processing_dates_
if output_netcdf:
output_writer_netcdf = NetCDFWriter(conf, overwrite_output, quiet_mode)
output_writer_netcdf.open(Path(outfile))
file_loader = KiwisLoader(conf, Path(infolder), dates_to_process, overwrite_output, quiet_mode)
file_loader = KiwisLoader(conf, Path(infolder), dates_to_process, overwrite_output, use_existing_file, quiet_mode)
for filename in file_loader:
file_timestamp = file_utils.get_timestamp_from_filename(filename) + timedelta(days=netcdf_offset_file_date)
print_msg(f'Processing file: {filename}')
@@ -138,6 +139,7 @@ def main(argv):
out_tiff=False,
out_netcdf=False,
overwrite_output=False,
use_existing_file=False,
start_date='',
end_date=END_DATE_DEFAULT,
interpolation_mode='adw',
@@ -175,6 +177,8 @@ def main(argv):
help="Outputs a single netCDF with all the timesteps [default: %(default)s]")
parser.add_argument("-f", "--force", dest="overwrite_output", action="store_true",
help="Force write to existing file. TIFF files will be overwritten and netCDF file will be appended. [default: %(default)s]")
parser.add_argument("-u", "--useexisting", dest="use_existing_file", action="store_true",
help="Force to use existing point/txt filenames, so these files will be used for gridding. [default: %(default)s]")
parser.add_argument("-m", "--mode", dest="interpolation_mode", required=False, type=interpolation_mode_type,
help="Set interpolation mode. [default: %(default)s]",
metavar=f"{list(Config.INTERPOLATION_MODES.keys())}")
@@ -233,15 +237,16 @@ def main(argv):

print_msg(f"Input Folder: {args.in_file_or_folder}")
print_msg(f"Overwrite Output: {args.overwrite_output}")
print_msg(f"Use Existing Point Files: {args.use_existing_file}")
print_msg(f"Interpolation Mode: {args.interpolation_mode}")
print_msg(f"RAM Save Mode: {args.memory_save_mode}")
print_msg(f"Broadcasting: {args.use_broadcasting}")
print_msg(f"Processing Dates File: {args.processing_dates_file}")
print_msg(f"Config File: {config_filename}")

run(config_filename, args.in_file_or_folder, args.output_file, args.processing_dates_file, file_utils, args.out_tiff,
args.out_netcdf, args.overwrite_output, start_date, end_date, args.interpolation_mode, args.use_broadcasting,
args.memory_save_mode)
args.out_netcdf, args.overwrite_output, args.use_existing_file, start_date, end_date, args.interpolation_mode,
args.use_broadcasting, args.memory_save_mode)
except Exception as e:
indent = len(program_name) * " "
sys.stderr.write(program_name + ": " + repr(e) + "\n")
46 changes: 26 additions & 20 deletions src/lisfloodutilities/gridding/lib/utils.py
@@ -485,10 +485,12 @@ def generate_grid(self, filename: Path) -> np.ndarray:

class KiwisLoader(Printable):

def __init__(self, conf: Config, infolder: Path, dates_to_process: dict = {}, overwrite_file: bool = False, quiet_mode: bool = False):
def __init__(self, conf: Config, infolder: Path, dates_to_process: dict = {}, overwrite_file: bool = False,
use_existing_file: bool = False, quiet_mode: bool = False):
super().__init__(quiet_mode)
self.conf = conf
self.overwrite_file = overwrite_file
self.use_existing_file = use_existing_file
self.var_code = self.conf.var_code
self.var_size = len(self.var_code)
self.inwildcard = self.conf.input_wildcard
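A hedged usage sketch of the extended constructor follows. The keyword arguments match the signature shown above; `conf` is assumed to be an already-built Config instance (its construction is outside this diff), and the import path simply mirrors the file being changed.

```python
from pathlib import Path

from lisfloodutilities.gridding.lib.utils import KiwisLoader


def iter_point_files(conf, infolder: str, reuse_existing: bool = True):
    """Yield point/txt filenames for gridding, reusing existing ones when requested."""
    loader = KiwisLoader(conf, Path(infolder), dates_to_process={},
                         overwrite_file=False,
                         use_existing_file=reuse_existing,
                         quiet_mode=True)
    for filename in loader:
        yield filename
```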
@@ -548,30 +550,34 @@ def __process_next_batch_of_files(self):
filepath_kiwis = [pair[0] for pair in files_group]
filepath_points = [pair[1] for pair in files_group]
kiwis_timestamps = [pair[2] for pair in files_group]
df_kiwis_array = []
# Allow us to get the output columns out of the dataframe
last_filter_class = KiwisFilter()
# At this point we need to process all the files nevertheless because even
# if 1 single file needs to be generated it might affect the results of the
# other existing ones, since one station that is filtered in one file might
# be filtered on the remaining files simultaneously.
for filter_class in self.filter_classes:
self.print_msg(f'{filter_class.get_class_description()}')
last_filter_class = filter_class
df_kiwis_array = filter_class.filter(filepath_kiwis, kiwis_timestamps, df_kiwis_array)
i = 0
for df in df_kiwis_array:
df_output = last_filter_class.get_dataframe_output_columns(df)
filepath = filepath_points[i]
if self.overwrite_file or not filepath.is_file():
# Skip regenerating the files if we are not in overwrite mode and all of them already exist.
regenerate_files = False
for filepath in filepath_points:
if not filepath.is_file():
regenerate_files = True
if self.overwrite_file or regenerate_files:
df_kiwis_array = []
# Allow us to get the output columns out of the dataframe
last_filter_class = KiwisFilter()
# At this point every file in the batch still has to be processed, because even
# if only one file needs to be regenerated it can affect the results of the
# existing ones: a station filtered out of one file must also be filtered out
# of the remaining files.
for filter_class in self.filter_classes:
self.print_msg(f'{filter_class.get_class_description()}')
last_filter_class = filter_class
df_kiwis_array = filter_class.filter(filepath_kiwis, kiwis_timestamps, df_kiwis_array)
i = 0
for df in df_kiwis_array:
df_output = last_filter_class.get_dataframe_output_columns(df)
filepath = filepath_points[i]
df_output.to_csv(
filepath,
index=False,
header=False,
sep="\t",
)
i += 1

i += 1
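The guard added above boils down to a simple rule: a batch of point files is regenerated only when overwriting is forced or at least one of the files is missing, since filtering one file can change which stations are kept in the others. A standalone sketch of that rule (paths are illustrative); expressing the check with any() is equivalent to the regenerate_files flag used in the diff.

```python
from pathlib import Path
from typing import List


def needs_regeneration(point_files: List[Path], overwrite: bool) -> bool:
    """True when the whole batch must be rebuilt: overwrite forced or any file missing."""
    return overwrite or any(not path.is_file() for path in point_files)


batch = [Path("/tmp/tx2024012506_00000000000000.txt"),
         Path("/tmp/tx2024012606_00000000000000.txt")]
print(needs_regeneration(batch, overwrite=False))  # True unless both files exist
```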

def __get_filter_classes(self) -> list:
'''
Expand Down Expand Up @@ -605,7 +611,7 @@ def __get_points_filename(self, kiwis_timestamp: str, filename_kiwis: Path) -> P
If use_existing_file is set, tries to return the first existing point file path it finds; if none is found, generates a new file path.
Otherwise generates a new file path.
'''
if self.overwrite_file:
if self.use_existing_file:
for points_path in sorted(filename_kiwis.parent.rglob(f'{self.var_code}{kiwis_timestamp}_??????????????.txt')):
if points_path.is_file():
return points_path
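With --useexisting set, __get_points_filename now searches for a previously generated point file next to the KIWIS file before falling back to a new path. A simplified sketch of that lookup, using the same glob pattern as above but returning None where the original generates a fresh file path:

```python
from pathlib import Path
from typing import Optional


def find_existing_points_file(kiwis_file: Path, var_code: str, timestamp: str) -> Optional[Path]:
    """Return the first existing point file matching the expected name pattern, if any."""
    pattern = f"{var_code}{timestamp}_??????????????.txt"
    for candidate in sorted(kiwis_file.parent.rglob(pattern)):
        if candidate.is_file():
            return candidate
    return None  # the caller would generate a new file path here
```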