From 8ac845af0365da84c9751ec1dc0603516fd1c4d8 Mon Sep 17 00:00:00 2001 From: Ken Kehoe Date: Mon, 29 Jul 2024 14:02:22 -0600 Subject: [PATCH] Summary QC (#844) * The DQR used for testing was reprocessed so it was no longer returned with the web API. Need to request Reprocessed assessment type to get the test to work. * Adding new method to create QC Summary of embedded QC. * Adding testing for large amounts of data when on ARM ADC systems. * PEP8 linting * Checking for scalar variables before indexing. Resizing the QC data variable when it only has a time dimension but data has time/height * PEP8 linting * Limiting the facilities to run in test. * PEP8 linting * PEP8 Linting * Adding in c level testing * Correcting the check for dimentionality when setting time values. * Adding option to only return qc mask. Also, correctly setting mask based on QC variable shape instead of assuming data and qc are the same size. * Correcting for incorrect QC variable attribute name. Checking if data is from SIRS SERI QC. If so will run an additional method to convert from SERI QC to standardized QC. * Changing the text for meaning and assessment for passing QC. Changed to update the QC varible in place so the order of variables are preserved. * Adding testing for SERI QC converter' * Continued improvments. * PEP8 * PEP8 * Adding specific SWATS cleanup method * Fixing issue with mask variable not in correct loop. * Temporarily adding two new files for testing. * Adding new tests and some PEP8 stuff. * Moving the dataset copy afer the cleanup_qc call. * Correctly testing each variable after modification. * Adding to docstring. Improving method to check if variable is a quality control variable to allow for calling it field or variable. * Adding test for fix_incorrect_variable_bit_description_attributes * Changing to method that does not use deprecicated .ndim Xarray method. * Adding new testing files. * Changing texting files to new correct import * Cleaned up how to determine if SERI QC should be applied. * Adding two new testing files * PEP8 * MNT: Update __init__.py * Changing way to find SIRI QC files to include more crazy screwups by ARM. * Makeing QC mask with either QC variable or data variable based on structure of the data file. * Catching additional parsing error for times when the time is not parsed upon reading. * Changing the big processing to allow for better documentation and switching between single and big processing. * Stopping adding new qc varible to qc variable. * PEP8 * PEP8 --------- Co-authored-by: zssherman --- act/io/arm.py | 2 +- act/qc/__init__.py | 2 + act/qc/arm.py | 8 ++ act/qc/clean.py | 253 ++++++++++++++++++++++++++++++++++- act/qc/qc_summary.py | 124 +++++++++++++++++ act/qc/qcfilter.py | 28 +++- act/tests/__init__.py | 2 + act/tests/sample_files.py | 2 + act/utils/datetime_utils.py | 3 +- tests/qc/conftest.py | 19 +++ tests/qc/test_clean.py | 189 +++++++++++++++++++++++++- tests/qc/test_qc_summary.py | 260 ++++++++++++++++++++++++++++++++++++ 12 files changed, 878 insertions(+), 14 deletions(-) create mode 100644 act/qc/qc_summary.py create mode 100644 tests/qc/conftest.py create mode 100644 tests/qc/test_qc_summary.py diff --git a/act/io/arm.py b/act/io/arm.py index 3633bc9845..e501726a4d 100644 --- a/act/io/arm.py +++ b/act/io/arm.py @@ -248,7 +248,7 @@ def read_arm_netcdf( file_dates.append(pts[2]) file_times.append(pts[3]) else: - if ds['time'].size > 1: + if len(ds['time'].shape) > 0: dummy = ds['time'].values[0] else: dummy = ds['time'].values diff --git a/act/qc/__init__.py b/act/qc/__init__.py index 89fd7c1aae..2b043494fd 100644 --- a/act/qc/__init__.py +++ b/act/qc/__init__.py @@ -15,6 +15,7 @@ 'clean', 'comparison_tests', 'qcfilter', + 'qc_summary', 'qctests', 'radiometer_tests', 'sp2', @@ -27,6 +28,7 @@ 'set_bit', 'unset_bit', ], + 'qc_summary': ['QCSummary'], 'qctests': [ 'QCTests', ], diff --git a/act/qc/arm.py b/act/qc/arm.py index 0096befe68..bf323e77ac 100644 --- a/act/qc/arm.py +++ b/act/qc/arm.py @@ -185,6 +185,14 @@ def add_dqr_to_qc( if variable is not None and var_name not in variable: continue + # Do not process quality control variables as this will create a new + # quality control variable for the quality control varible. + try: + if ds[var_name].attrs['standard_name'] == 'quality_flag': + continue + except KeyError: + pass + try: ds.qcfilter.add_test( var_name, diff --git a/act/qc/clean.py b/act/qc/clean.py index 00ab5f8cc5..2c0bb7baa7 100644 --- a/act/qc/clean.py +++ b/act/qc/clean.py @@ -91,6 +91,7 @@ def cleanup( link_qc_variables=True, normalize_assessment=False, cleanup_cf_qc=True, + cleanup_incorrect_qc_attributes=True, **kwargs, ): """ @@ -118,6 +119,9 @@ def cleanup( Option to clean up assessments to use the same terminology. Set to False for default because should only be an issue after adding DQRs and the function to add DQRs calls this method. + cleanup_incorrect_qc_attributes : bool + Fix incorrectly named quality control variable attributes before + converting to standardized QC. **kwargs : keywords Keyword arguments passed through to clean.clean_arm_qc method. @@ -131,6 +135,12 @@ def cleanup( ds.clean.cleanup() """ + # There are some QC variables with incorrect bit_#_description attribute names. + # This will check for the incorrect attribute names and correct to allow next + # process to work correctly + if cleanup_incorrect_qc_attributes: + self._ds.clean.fix_incorrect_variable_bit_description_attributes() + # Convert ARM QC to be more like CF state fields if cleanup_arm_qc: self._ds.clean.clean_arm_qc(**kwargs) @@ -579,10 +589,7 @@ def correct_valid_minmax(self, qc_variable): def link_variables(self): """ - Add some attributes to link and explain data - to QC data relationship. Will use non-CF standard_name - of quality_flag. Hopefully this will be added to the - standard_name table in the future. + Add some attributes to link and explain data to QC data relationship. """ for var in self._ds.data_vars: aa = re.match(r'^qc_(.+)', var) @@ -591,9 +598,10 @@ def link_variables(self): qc_variable = var except AttributeError: continue + # Skip data quality fields. try: - if "Quality check results on field:" not in self._ds[var].attrs["long_name"]: + if not self._ds[var].attrs["long_name"].startswith("Quality check results on"): continue except KeyError: pass @@ -607,7 +615,11 @@ def link_variables(self): # If the QC variable is not in ancillary_variables add if qc_variable not in ancillary_variables: ancillary_variables = qc_variable - self._ds[variable].attrs['ancillary_variables'] = copy.copy(ancillary_variables) + + try: + self._ds[variable].attrs['ancillary_variables'] = copy.copy(ancillary_variables) + except KeyError: + pass # Check if QC variable has correct standard_name and iff not fix it. correct_standard_name = 'quality_flag' @@ -648,6 +660,7 @@ def clean_arm_qc( """ global_qc = self.get_attr_info() + qc_attributes = None for qc_var in self.matched_qc_variables: # Clean up units attribute from unitless to udunits '1' try: @@ -749,6 +762,32 @@ def clean_arm_qc( qc_var_name=qc_var_name, test_number=test_to_remove ) + # If the QC was not cleaned up because it is not correctly formatted with SERI QC + # call the SERI QC method. + if global_qc is None and qc_attributes is None: + try: + DQMS = self._ds.attrs['qc_method'] == 'DQMS' + self._ds.attrs['comment'] + except KeyError: + try: + DQMS = 'sirs_seriqc' in self._ds.attrs['Command_Line'] + except KeyError: + DQMS = False + + if DQMS: + self._ds.clean.clean_seri_qc() + + # If the QC was not cleaned up because it is not correctly formatted with + # SWATS global attributes call the SWATS QC method. + try: + text = 'SWATS QC checks (bit values)' + SWATS_QC = text in self._ds.attrs['Mentor_QC_Field_Information'] + except KeyError: + SWATS_QC = False + + if SWATS_QC and global_qc is None and qc_attributes is None: + self._ds.clean.clean_swats_qc() + def normalize_assessment( self, variables=None, @@ -896,3 +935,205 @@ def clean_cf_qc(self, variables=None, sep='__', **kwargs): except KeyError: pass + + def fix_incorrect_variable_bit_description_attributes(self): + """ + Method to correct incorrectly defined quality control variable attributes. + There are some datastreams with the attribute names incorrectly having 'qc_' + prepended to the attribute name. This will fix those attributes so the cleanqc + method can correctly read the attributes. + + If the variable long_name starts with the string "Quality check results on" + and a variable attribute follows the pattern qc_bit_#_description the 'qc_' part of + the variable attribute will be removed. + + """ + + attr_description_pattern = r'^qc_bit_([0-9]+)_description$' + attr_assessment_pattern = r'^qc_bit_([0-9]+)_assessment$' + + for var_name in self._ds.data_vars: + try: + if not self._ds[var_name].attrs['long_name'].startswith("Quality check results on"): + continue + except KeyError: + continue + + for attr, value in self._ds[var_name].attrs.copy().items(): + for pattern in [attr_description_pattern, attr_assessment_pattern]: + description = re.match(pattern, attr) + if description is not None: + new_attr = attr[3:] + self._ds[var_name].attrs[new_attr] = self._ds[var_name].attrs.pop(attr) + + def clean_seri_qc(self): + """ + Method to apply SERI QC to the quality control variables. The definition of the QC + is listed in a single global attribute and not easily parsable. This method will update + the quality control variable to correctly set the test descriptions for each of the + SERI QC tests defined in the global attributes. + + """ + for var_name in self._ds.data_vars: + if not self._ds[var_name].attrs['long_name'].startswith("Quality check results on"): + continue + + qc_var_name = var_name + var_name = var_name.replace('qc_', '') + qc_data = self._ds[qc_var_name].values.copy() + self._ds[qc_var_name] = xr.zeros_like(self._ds[qc_var_name], dtype=np.int32) + + if qc_var_name in [ + "qc_down_short_diffuse", + "qc_short_direct_normal", + "qc_down_short_hemisp", + ]: + value_number = [1, 2, 3, 6, 7, 8, 9, 94, 95, 96, 97] + test_number = list(range(2, len(value_number) + 2)) + test_description = [ + 'Passed 1-component test; data fall within max-min limits of Kt,Kn, or Kd', + 'Passed 2-component test; data fall within 0.03 of the Gompertz boundaries', + 'Passed 3-component test; data come within +/- 0.03 of satifying Kt=Kn+Kd', + 'Value estimated; passes all pertinent SERI QC tests', + 'Failed 1-component test; lower than allowed minimum', + 'Falied 1-component test; higher than allowed maximum', + 'Passed 3-component test but failed 2-component test by >0.05', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.05 to 0.10.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.10 to 0.15.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.15 to 0.20.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of >= 0.20.', + ] + test_assessment = [ + 'Not failing', + 'Not failing', + 'Not failing', + 'Not failing', + 'Bad', + 'Bad', + 'Indeterminate', + 'Bad', + 'Bad', + 'Bad', + 'Bad', + ] + elif qc_var_name in ["qc_up_long_hemisp", "qc_down_long_hemisp_shaded"]: + value_number = [1, 2, 7, 8, 31] + test_number = list(range(2, len(value_number) + 2)) + test_description = [ + 'Passed 1-component test; data fall within max-min limits of up_long_hemisp and down_long_hemisp_shaded, but short_direct_normal and down_short_hemisp or down_short_diffuse fail the SERI QC tests.', + 'Passed 2-component test; data fall within max-min limits of up_long_hemisp and down_long_hemisp_shaded, and short_direct_normal, or down_short_hemisp and down_short_diffuse pass the SERI QC tests while the difference between down_short_hemisp and down_short_diffuse is greater than 20 W/m2.', + 'Failed 1-component test; lower than allowed minimum', + 'Failed 1-component test; higher than allowed maximum', + 'Failed 2-component test', + ] + test_assessment = [ + 'Not failing', + 'Not failing', + 'Bad', + 'Bad', + 'Bad', + ] + elif qc_var_name in ["qc_up_short_hemisp"]: + value_number = [1, 2, 7, 8, 31] + test_number = list(range(2, len(value_number) + 2)) + test_description = [ + 'Passed 1-component test', + 'Passed 2-component test', + 'Failed 1-component test; lower than allowed minimum', + 'Failed 1-component test; higher than allowed maximum', + 'Failed 2-component test; solar zenith angle is less than 80 degrees and down_short_hemisp is 0 or missing', + ] + test_assessment = [ + 'Not failing', + 'Not failing', + 'Bad', + 'Bad', + 'Bad', + ] + + self._ds[var_name].attrs['ancillary_variables'] = qc_var_name + self._ds[qc_var_name].attrs['standard_name'] = 'quality_flag' + self._ds[qc_var_name].attrs['flag_masks'] = [] + self._ds[qc_var_name].attrs['flag_meanings'] = [] + self._ds[qc_var_name].attrs['flag_assessments'] = [] + + self._ds.qcfilter.add_missing_value_test(var_name) + + for ii, _ in enumerate(value_number): + index = qc_data == value_number[ii] + self._ds.qcfilter.add_test( + var_name, + index=index, + test_number=test_number[ii], + test_meaning=test_description[ii], + test_assessment=test_assessment[ii], + ) + + if qc_var_name in [ + "qc_down_short_diffuse", + "qc_short_direct_normal", + "qc_down_short_hemisp", + ]: + calculation = ((qc_data + 2) / 4.0) % 4 + calculation = calculation.astype(np.int16) + value_number = [0, 1, 2, 3] + test_description = [ + 'Parameter too low by 3-component test (Kt=Kn+Kd)', + 'Parameter too high by 3-component test (Kt=Kn+Kd)', + 'Parameter too low by 2-component test (Gompertz boundary)', + 'Parameter too high by 2-component test (Gompertz boundary)', + ] + test_assessment = ['Bad', 'Bad', 'Bad', 'Bad'] + for ii, _ in enumerate(value_number): + index = (qc_data >= 10) & (qc_data <= 93) & (calculation == value_number[ii]) + self._ds.qcfilter.add_test( + var_name, + index=index, + test_meaning=test_description[ii], + test_assessment=test_assessment[ii], + ) + + def clean_swats_qc(self, fix_data_units=True): + """ + Method to apply SWATS global attribute quality control definition to the + quality control variables. + + Parameters + ---------- + fix_data_units : bool + The units string for some data variables incorrectly defines degrees Celsius + as 'C' insted of the udunits 'degC'. When set to true those units strings + are updated. + + """ + + for var_name in self._ds.data_vars: + if fix_data_units: + try: + unit = self._ds[var_name].attrs['units'] + if unit == 'C': + self._ds[var_name].attrs['units'] = 'degC' + except KeyError: + pass + + if not self._ds[var_name].attrs['long_name'].startswith("Quality check results on"): + continue + + qc_var_name = var_name + self._ds[qc_var_name].attrs['flag_masks'] = [1, 2, 4, 8] + self._ds[qc_var_name].attrs['flag_meanings'] = [ + 'Value is set to missing_value.', + 'Data value less than valid_min.', + 'Data value greater than valid_max.', + 'Difference between current and previous values exceeds valid_delta.', + ] + self._ds[qc_var_name].attrs['flag_assessments'] = [ + 'Bad', + 'Bad', + 'Bad', + 'Indeterminate', + ] + + self._ds.clean.correct_valid_minmax(qc_var_name) + + del self._ds.attrs['Mentor_QC_Field_Information'] diff --git a/act/qc/qc_summary.py b/act/qc/qc_summary.py new file mode 100644 index 0000000000..ff6518dde0 --- /dev/null +++ b/act/qc/qc_summary.py @@ -0,0 +1,124 @@ +""" +Method for creating Quality Control Summary variables from the embedded +quality control varialbes. The summary variable is a simplified version of +quality control that uses flag integers instead of bit-packed masks. The +number of descriptions is simplified to consolidate all categories into one +description. + +""" + +import datetime + + +class QCSummary: + """ + This is a Mixins class used to allow using qcfilter class that is already + registered to the Xarray dataset. All the methods in this class will be added + to the qcfilter class. Doing this to make the code spread across more files + so it is more manageable and readable. + + """ + + def __init__(self, ds): + """initialize""" + self._ds = ds + + def create_qc_summary(self, cleanup_qc=False): + """ + Method to convert embedded quality control to summary QC that utilzes + flag values instead of flag masks and summarizes the assessments to only + a few states. Lowest level of quality control will be listed first with most + sever having higher integer numbers. Dataset is updated in place. + + cleanup_qc : boolean + Call clean.cleanup() method to convert to standardized ancillary quality control + variables. The quality control summary requires the current embedded quality + control variables to use ACT standards. + + Returns + ------- + return_ds : Xarray.dataset + ACT Xarray dataset with quality control variables converted to summary flag values. + + """ + + standard_assessments = [ + 'Suspect', + 'Indeterminate', + 'Incorrect', + 'Bad', + ] + standard_meanings = [ + "Data suspect, further analysis recommended", + "Data suspect, further analysis recommended", + "Data incorrect, use not recommended", + "Data incorrect, use not recommended", + ] + + if cleanup_qc: + self._ds.clean.cleanup() + + return_ds = self._ds.copy() + + added = False + for var_name in list(self._ds.data_vars): + qc_var_name = self.check_for_ancillary_qc(var_name, add_if_missing=False, cleanup=False) + + if qc_var_name is None: + continue + + added = True + + assessments = list(set(self._ds[qc_var_name].attrs['flag_assessments'])) + + import xarray as xr + + result = xr.zeros_like(return_ds[qc_var_name]) + for attr in ['flag_masks', 'flag_meanings', 'flag_assessments', 'flag_values']: + try: + del result.attrs[attr] + except KeyError: + pass + + return_ds[qc_var_name] = result + + return_ds.qcfilter.add_test( + var_name, + index=None, + test_number=0, + test_meaning='Not failing quality control tests', + test_assessment='Not failing', + flag_value=True, + ) + + for ii, assessment in enumerate(standard_assessments): + if assessment not in assessments: + continue + + qc_mask = self.get_masked_data( + var_name, rm_assessments=assessment, return_mask_only=True + ) + + # Do not really know how to handle scalars yet. + if qc_mask.ndim == 0: + continue + + return_ds.qcfilter.add_test( + var_name, + index=qc_mask, + test_meaning=standard_meanings[ii], + test_assessment=assessment, + flag_value=True, + ) + + self._ds.update({qc_var_name: return_ds[qc_var_name]}) + + if added: + history = return_ds.attrs['history'] + history += ( + " ; Quality control summary implemented by ACT at " + f"{datetime.datetime.utcnow().isoformat()} UTC." + ) + return_ds.attrs['history'] = history + + return return_ds diff --git a/act/qc/qcfilter.py b/act/qc/qcfilter.py index aeecbe2a85..c8a64ca5e4 100644 --- a/act/qc/qcfilter.py +++ b/act/qc/qcfilter.py @@ -9,11 +9,11 @@ import numpy as np import xarray as xr -from act.qc import comparison_tests, qctests, bsrn_tests +from act.qc import comparison_tests, qctests, bsrn_tests, qc_summary @xr.register_dataset_accessor('qcfilter') -class QCFilter(qctests.QCTests, comparison_tests.QCTests, bsrn_tests.QCTests): +class QCFilter(qctests.QCTests, comparison_tests.QCTests, bsrn_tests.QCTests, qc_summary.QCSummary): """ A class for building quality control variables containing arrays for filtering data based on a set of test condition typically based on the @@ -539,7 +539,10 @@ def set_test(self, var_name, index=None, test_number=None, flag_value=False): if index is not None: if flag_value: - qc_variable[index] = test_number + if len(qc_variable.shape) == 0: + qc_variable = test_number + else: + qc_variable[index] = test_number else: if bool(np.shape(index)): qc_variable[index] = set_bit(qc_variable[index], test_number) @@ -792,6 +795,7 @@ def get_masked_data( return_nan_array=False, ma_fill_value=None, return_inverse=False, + return_mask_only=False, ): """ Returns a numpy masked array containing data and mask or @@ -818,6 +822,8 @@ def get_masked_data( Invert the masked array mask or return data array where mask is set to False instead of True set to NaN. Useful for overplotting where failing. + return_mask_only : boolean + Return the boolean mask only as a numpy array. Returns ------- @@ -902,9 +908,21 @@ def get_masked_data( if variable.dtype in (np.float64, np.int64): nan_dtype = np.float64 - mask = np.zeros(variable.shape, dtype=bool) + try: + # Get shape of mask from QC variable since there is a chance it will + # be a different shape than data variable. + mask = np.zeros(self._ds[qc_var_name].shape, dtype=bool) + except KeyError: + # If there is no QC variable make mask from shape of data variable. + mask = np.zeros(self._ds[var_name].shape, dtype=bool) + for test in test_numbers: - mask = mask | self._ds.qcfilter.get_qc_test_mask(var_name, test, flag_value=flag_value) + qc_test_mask = self._ds.qcfilter.get_qc_test_mask(var_name, test, flag_value=flag_value) + mask = mask | qc_test_mask + + # If requested only return the mask. + if return_mask_only: + return mask # Convert data numpy array into masked array try: diff --git a/act/tests/__init__.py b/act/tests/__init__.py index 23d5708a62..c41d4b7d83 100644 --- a/act/tests/__init__.py +++ b/act/tests/__init__.py @@ -61,6 +61,8 @@ 'EXAMPLE_CCN', 'EXAMPLE_OLD_QC', 'EXAMPLE_AOSACSM', + 'EXAMPLE_SIRS_SIRI_QC', + 'EXAMPLE_SWATS', ] }, ) diff --git a/act/tests/sample_files.py b/act/tests/sample_files.py index 11991023f5..47704a63e2 100644 --- a/act/tests/sample_files.py +++ b/act/tests/sample_files.py @@ -31,7 +31,9 @@ EXAMPLE_SIGMA_MPLV5 = DATASETS.fetch('201509021500.bi') EXAMPLE_RL1 = DATASETS.fetch('sgprlC1.a0.20160131.000000.nc') EXAMPLE_CO2FLX4M = DATASETS.fetch('sgpco2flx4mC1.b1.20201007.001500.nc') +EXAMPLE_SWATS = DATASETS.fetch('sgpswatsE8.b1.20071229.000700.cdf') EXAMPLE_SIRS = DATASETS.fetch('sgpsirsE13.b1.20190101.000000.cdf') +EXAMPLE_SIRS_SIRI_QC = DATASETS.fetch('sgpsirsC1.b1.20040101.000000.cdf') EXAMPLE_GML_RADIATION = DATASETS.fetch('brw21001.dat') EXAMPLE_GML_MET = DATASETS.fetch('met_brw_insitu_1_obop_hour_2020.txt') EXAMPLE_GML_OZONE = DATASETS.fetch('brw_12_2020_hour.dat') diff --git a/act/utils/datetime_utils.py b/act/utils/datetime_utils.py index 70c8945fb1..b65cfbec79 100644 --- a/act/utils/datetime_utils.py +++ b/act/utils/datetime_utils.py @@ -55,6 +55,7 @@ def numpy_to_arm_date(_date, returnTime=False): """ from dateutil.parser._parser import ParserError + from pandas._libs.tslibs.parsing import DateParseError try: date = pd.to_datetime(str(_date)) @@ -62,7 +63,7 @@ def numpy_to_arm_date(_date, returnTime=False): date = date.strftime('%Y%m%d') else: date = date.strftime('%H%M%S') - except ParserError: + except (ParserError, DateParseError): date = None return date diff --git a/tests/qc/conftest.py b/tests/qc/conftest.py new file mode 100644 index 0000000000..ab31c52f9f --- /dev/null +++ b/tests/qc/conftest.py @@ -0,0 +1,19 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--runbig", action="store_true", default=False, help="Run big tests") + + +def pytest_configure(config): + config.addinivalue_line("markers", "big: mark test as slow to run") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runbig"): + # --runbig given in cli: do not skip big tests + return + skip_big = pytest.mark.skip(reason="need --runbig option to run") + for item in items: + if "big" in item.keywords: + item.add_marker(skip_big) diff --git a/tests/qc/test_clean.py b/tests/qc/test_clean.py index add485b3d9..ec1e7ba4e2 100644 --- a/tests/qc/test_clean.py +++ b/tests/qc/test_clean.py @@ -1,7 +1,14 @@ import numpy as np from act.io.arm import read_arm_netcdf -from act.tests import EXAMPLE_CEIL1, EXAMPLE_CO2FLX4M, EXAMPLE_MET1 +from act.tests import ( + EXAMPLE_CEIL1, + EXAMPLE_CO2FLX4M, + EXAMPLE_MET1, + EXAMPLE_MET_SAIL, + EXAMPLE_SIRS_SIRI_QC, + EXAMPLE_SWATS, +) def test_global_qc_cleanup(): @@ -154,3 +161,183 @@ def test_qc_flag_description(): unique_flag_assessments = list({'Acceptable', 'Indeterminate', 'Bad'}) for f in list(set(ds[qc_var_name].attrs['flag_assessments'])): assert f in unique_flag_assessments + + +def test_clean_sirs_siri_qc(): + ds = read_arm_netcdf(EXAMPLE_SIRS_SIRI_QC) + + data = ds["qc_short_direct_normal"].values + data[0:5] = 1 + data[5:10] = 2 + data[10:15] = 3 + data[15:20] = 6 + data[20:25] = 7 + data[25:30] = 8 + data[30:35] = 9 + data[35:40] = 94 + data[40:45] = 95 + data[45:50] = 96 + data[50:55] = 97 + data[55:60] = 14 + data[60:65] = 18 + data[65:70] = 22 + data[70:75] = 26 + ds["qc_short_direct_normal"].values = data + + data = ds["qc_up_long_hemisp"].values + data[0:5] = 1 + data[5:10] = 2 + data[10:15] = 7 + data[15:20] = 8 + data[20:25] = 31 + ds["qc_up_long_hemisp"].values = data + + data = ds["qc_up_short_hemisp"].values + data[0:5] = 1 + data[5:10] = 2 + data[10:15] = 7 + data[15:20] = 8 + data[20:25] = 31 + ds["qc_up_short_hemisp"].values = data + + ds.clean.cleanup() + + assert ds["qc_short_direct_normal"].attrs['flag_masks'] == [ + 1, + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 32768, + ] + assert ds["qc_short_direct_normal"].attrs['flag_meanings'] == [ + 'Value is set to missing_value.', + 'Passed 1-component test; data fall within max-min limits of Kt,Kn, or Kd', + 'Passed 2-component test; data fall within 0.03 of the Gompertz boundaries', + 'Passed 3-component test; data come within +/- 0.03 of satifying Kt=Kn+Kd', + 'Value estimated; passes all pertinent SERI QC tests', + 'Failed 1-component test; lower than allowed minimum', + 'Falied 1-component test; higher than allowed maximum', + 'Passed 3-component test but failed 2-component test by >0.05', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.05 to 0.10.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.10 to 0.15.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of 0.15 to 0.20.', + 'Data fall into a physically impossible region where Kn>Kt by K-space distances of >= 0.20.', + 'Parameter too low by 3-component test (Kt=Kn+Kd)', + 'Parameter too high by 3-component test (Kt=Kn+Kd)', + 'Parameter too low by 2-component test (Gompertz boundary)', + 'Parameter too high by 2-component test (Gompertz boundary)', + ] + + assert ds["qc_up_long_hemisp"].attrs['flag_masks'] == [1, 2, 4, 8, 16, 32] + assert ds["qc_up_long_hemisp"].attrs['flag_meanings'] == [ + 'Value is set to missing_value.', + 'Passed 1-component test; data fall within max-min limits of up_long_hemisp and down_long_hemisp_shaded, but short_direct_normal and down_short_hemisp or down_short_diffuse fail the SERI QC tests.', + 'Passed 2-component test; data fall within max-min limits of up_long_hemisp and down_long_hemisp_shaded, and short_direct_normal, or down_short_hemisp and down_short_diffuse pass the SERI QC tests while the difference between down_short_hemisp and down_short_diffuse is greater than 20 W/m2.', + 'Failed 1-component test; lower than allowed minimum', + 'Failed 1-component test; higher than allowed maximum', + 'Failed 2-component test', + ] + + assert ds["qc_up_short_hemisp"].attrs['flag_masks'] == [1, 2, 4, 8, 16, 32] + assert ds["qc_up_short_hemisp"].attrs['flag_meanings'] == [ + 'Value is set to missing_value.', + 'Passed 1-component test', + 'Passed 2-component test', + 'Failed 1-component test; lower than allowed minimum', + 'Failed 1-component test; higher than allowed maximum', + 'Failed 2-component test; solar zenith angle is less than 80 degrees and down_short_hemisp is 0 or missing', + ] + + assert np.all(ds["qc_short_direct_normal"].values[0:5] == 2) + assert np.all(ds["qc_short_direct_normal"].values[5:10] == 4) + assert np.all(ds["qc_short_direct_normal"].values[10:15] == 8) + assert np.all(ds["qc_short_direct_normal"].values[15:20] == 16) + assert np.all(ds["qc_short_direct_normal"].values[20:25] == 32) + assert np.all(ds["qc_short_direct_normal"].values[25:30] == 64) + assert np.all(ds["qc_short_direct_normal"].values[30:35] == 128) + assert np.all(ds["qc_short_direct_normal"].values[35:40] == 256) + assert np.all(ds["qc_short_direct_normal"].values[40:45] == 512) + assert np.all(ds["qc_short_direct_normal"].values[45:50] == 1024) + assert np.all(ds["qc_short_direct_normal"].values[50:55] == 2048) + assert np.all(ds["qc_short_direct_normal"].values[55:60] == 4096) + assert np.all(ds["qc_short_direct_normal"].values[60:65] == 8192) + assert np.all(ds["qc_short_direct_normal"].values[65:70] == 16384) + assert np.all(ds["qc_short_direct_normal"].values[70:75] == 32768) + + assert np.all(ds["qc_up_long_hemisp"].values[0:5] == 2) + assert np.all(ds["qc_up_long_hemisp"].values[5:10] == 4) + assert np.all(ds["qc_up_long_hemisp"].values[10:15] == 8) + assert np.all(ds["qc_up_long_hemisp"].values[15:20] == 16) + assert np.all(ds["qc_up_long_hemisp"].values[20:25] == 32) + + assert np.all(ds["qc_up_short_hemisp"].values[0:5] == 2) + assert np.all(ds["qc_up_short_hemisp"].values[5:10] == 4) + assert np.all(ds["qc_up_short_hemisp"].values[10:15] == 8) + assert np.all(ds["qc_up_short_hemisp"].values[15:20] == 16) + assert np.all(ds["qc_up_short_hemisp"].values[20:25] == 32) + + +def test_swats_qc(): + ds = read_arm_netcdf(EXAMPLE_SWATS) + ds.clean.cleanup() + + data_var_names = [] + for var_name in ds.data_vars: + try: + ds[f'qc_{var_name}'] + data_var_names.append(var_name) + except KeyError: + pass + + for var_name in data_var_names: + qc_var_name = f'qc_{var_name}' + + assert ds[qc_var_name].attrs['flag_masks'] == [1, 2, 4, 8] + assert ds[qc_var_name].attrs['flag_meanings'] == [ + 'Value is set to missing_value.', + 'Data value less than fail_min.', + 'Data value greater than fail_max.', + 'Difference between current and previous values exceeds fail_delta.', + ] + assert ds[qc_var_name].attrs['flag_assessments'] == ['Bad', 'Bad', 'Bad', 'Indeterminate'] + assert 'fail_min' in ds[qc_var_name].attrs + assert 'fail_max' in ds[qc_var_name].attrs + assert 'fail_delta' in ds[qc_var_name].attrs + + assert 'valid_min' not in ds[var_name].attrs + assert 'valid_max' not in ds[var_name].attrs + assert 'valid_delta' not in ds[var_name].attrs + assert ds[var_name].attrs['units'] != 'C' + + +def test_fix_incorrect_variable_bit_description_attributes(): + ds = read_arm_netcdf(EXAMPLE_MET_SAIL) + qc_var_name = 'qc_temp_mean' + ds[qc_var_name].attrs['qc_bit_2_description'] = ds[qc_var_name].attrs['bit_2_description'] + ds[qc_var_name].attrs['qc_bit_2_assessment'] = ds[qc_var_name].attrs['bit_2_assessment'] + del ds[qc_var_name].attrs['bit_2_description'] + del ds[qc_var_name].attrs['bit_2_assessment'] + + ds.clean.cleanup() + + assert ds[qc_var_name].attrs['flag_masks'] == [1, 2, 4, 8] + assert ds[qc_var_name].attrs['flag_meanings'] == [ + 'Value is equal to missing_value.', + 'Value is less than fail_min.', + 'Value is greater than fail_max.', + 'Difference between current and previous values exceeds fail_delta.', + ] + assert ds[qc_var_name].attrs['flag_assessments'] == ['Bad', 'Bad', 'Bad', 'Indeterminate'] + assert 'qc_bit_2_description' not in ds[qc_var_name].attrs + assert 'qc_bit_2_assessment' not in ds[qc_var_name].attrs diff --git a/tests/qc/test_qc_summary.py b/tests/qc/test_qc_summary.py new file mode 100644 index 0000000000..40cd8c5fc0 --- /dev/null +++ b/tests/qc/test_qc_summary.py @@ -0,0 +1,260 @@ +import numpy as np +from os import environ +from pathlib import Path +import random +import pytest +import datetime + +from act.io.arm import read_arm_netcdf +from act.tests import EXAMPLE_MET1 +from act.qc.qcfilter import set_bit +from act.utils.data_utils import DatastreamParserARM + + +def test_qc_summary(): + for cleanup in [False, True]: + ds = read_arm_netcdf(EXAMPLE_MET1, cleanup_qc=not cleanup) + for var_name in ['temp_mean', 'rh_mean']: + qc_var_name = f'qc_{var_name}' + qc_data = ds[qc_var_name].values + + assert np.sum(qc_data) == 0 + + index_4 = np.arange(100, 200) + qc_data[index_4] = set_bit(qc_data[index_4], 4) + index_1 = np.arange(170, 230) + qc_data[index_1] = set_bit(qc_data[index_1], 1) + index_2 = np.arange(250, 400) + qc_data[index_2] = set_bit(qc_data[index_2], 2) + index_3 = np.arange(450, 510) + qc_data[index_3] = set_bit(qc_data[index_3], 3) + ds[qc_var_name].values = qc_data + + result = ds.qcfilter.create_qc_summary(cleanup_qc=cleanup) + + for var_name in ['temp_mean', 'rh_mean']: + assert 'flag_masks' not in result[qc_var_name].attrs.keys() + assert isinstance(result[qc_var_name].attrs['flag_values'], list) + + assert np.sum(result[qc_var_name].values) == 610 + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Indeterminate') + assert np.all(np.where(qc_ma.mask)[0] == np.arange(100, 170)) + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Bad') + index = np.concatenate([index_1, index_2, index_3]) + assert np.all(np.where(qc_ma.mask)[0] == index) + + assert "Quality control summary implemented by ACT" in result.attrs['history'] + + del ds + + +def test_qc_summary_multiple_assessment_names(): + ds = read_arm_netcdf(EXAMPLE_MET1, cleanup_qc=True) + var_name = 'temp_mean' + qc_var_name = f'qc_{var_name}' + qc_data = ds[qc_var_name].values + + assert np.sum(qc_data) == 0 + + index_4 = np.arange(200, 300) + qc_data[index_4] = set_bit(qc_data[index_4], 4) + index_1 = np.arange(270, 330) + qc_data[index_1] = set_bit(qc_data[index_1], 1) + index_2 = np.arange(350, 500) + qc_data[index_2] = set_bit(qc_data[index_2], 2) + index_3 = np.arange(550, 610) + qc_data[index_3] = set_bit(qc_data[index_3], 3) + ds[qc_var_name].values = qc_data + + index_5 = np.arange(50, 150) + ds.qcfilter.add_test( + var_name, index=index_5, test_meaning='Testing Suspect', test_assessment='Suspect' + ) + + index_6 = np.arange(130, 210) + ds.qcfilter.add_test( + var_name, index=index_6, test_meaning='Testing Incorrect', test_assessment='Incorrect' + ) + + result = ds.qcfilter.create_qc_summary() + + assert result[qc_var_name].attrs['flag_assessments'] == [ + 'Not failing', + 'Suspect', + 'Indeterminate', + 'Incorrect', + 'Bad', + ] + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Indeterminate') + assert np.sum(np.where(qc_ma.mask)[0]) == 14370 + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Suspect') + assert np.sum(np.where(qc_ma.mask)[0]) == 7160 + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Bad') + assert np.sum(np.where(qc_ma.mask)[0]) == 116415 + + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Incorrect') + assert np.sum(np.where(qc_ma.mask)[0]) == 13560 + + assert np.sum(np.where(result[qc_var_name].values == 0)) == 884575 + qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Not failing') + assert np.sum(np.where(qc_ma.mask)[0]) == 884575 + + +@pytest.mark.big +@pytest.mark.skipif('ARCHIVE_DATA' not in environ, reason="Running outside ADC system.") +def test_qc_summary_big_data(): + """ + We want to test on as much ARM data as possible. But we do not want to force + a large amount of test data in GitHub. Plan is to see if the pytest code is being + run on ARM system and if so then run on historical data. If running on GitHub + then don't run tests. Also, have a switch to not force this big test to always + run as that would be mean to the developer. So need to periodicaly run with the + manual switch enabled. + + All exceptions are caught and a file name is sent to the output file when + an exception is found. Since this is testing 10,000+ files it will take hours + to run. I suggest you run in background and capture the standard out to a different + file. If no files are written to the output file then all tests passed. + + Output file name follows the convention of: + ~/test_qc_summary_big_data.{datetime}.txt + + To Run this test set keyword on pytest command line: + > pytest -s --runbig test_qc_summary.py::test_qc_summary_big_data &> ~/out.txt & + + + """ + + base_path = Path(environ['ARCHIVE_DATA']) + if not base_path.is_dir(): + return + + # Set number of files from each directory to test. + skip_sites = [ + 'shb', + 'wbu', + 'dna', + 'rld', + 'smt', + 'nic', + 'isp', + 'dmf', + 'nac', + 'rev', + 'yeu', + 'zrh', + 'osc', + ] + skip_datastream_codes = [ + 'mmcrmom', + # 'microbasepi', + # 'lblch1a', + # '30co2flx4mmet', + # 'microbasepi2', + # '30co2flx60m', + # 'bbhrpavg1mlawer', + # 'co', + # 'lblch1b', + # '30co2flx25m', + # '30co2flx4m', + # 'armbeatm', + # 'armtrajcld', + # '1swfanalsiros1long', + ] + # skip_datastreams = ['nimmfrsraod5chcorM1.c1', 'anxaoso3M1.b0'] + num_files = 3 + expected_assessments = ['Not failing', 'Suspect', 'Indeterminate', 'Incorrect', 'Bad'] + + testing_files = [] + + single_test = False + if len(testing_files) == 0: + single_test = True + filename = ( + f'test_qc_summary_big_data.{datetime.datetime.utcnow().strftime("%Y%m%d.%H%M%S")}.txt' + ) + output_file = Path(environ['HOME'], filename) + output_file.unlink(missing_ok=True) + output_file.touch() + + site_dirs = list(base_path.glob('???')) + for site_dir in site_dirs: + if site_dir.name in skip_sites: + continue + + datastream_dirs = list(site_dir.glob('*.[bc]?')) + for datastream_dir in datastream_dirs: + if '-' in datastream_dir.name: + continue + + # if datastream_dir.name in skip_datastreams: + # continue + + fn_obj = DatastreamParserARM(datastream_dir.name) + facility = fn_obj.facility + if facility is not None and facility[0] in ['A', 'X', 'U', 'F', 'N']: + continue + + datastream_class = fn_obj.datastream_class + if datastream_class is not None and datastream_class in skip_datastream_codes: + continue + + files = list(datastream_dir.glob('*.nc')) + files.extend(datastream_dir.glob('*.cdf')) + if len(files) == 0: + continue + + num_tests = num_files + if len(files) < num_files: + num_tests = len(files) + + for ii in range(0, num_tests): + testing_files.append(random.choice(files)) + + if single_test: + print(f"Testing {len(testing_files)} files\n") + print(f"Output file name = {output_file}\n") + + for file in testing_files: + try: + print(f"Testing: {file}") + ds = read_arm_netcdf(str(file), cleanup_qc=True, decode_times=False) + ds = ds.qcfilter.create_qc_summary() + + created_qc_summary = False + for var_name in ds.data_vars: + qc_var_name = ds.qcfilter.check_for_ancillary_qc( + var_name, add_if_missing=False, cleanup=False + ) + + if qc_var_name is None: + continue + + created_qc_summary = True + assert isinstance(ds[qc_var_name].attrs['flag_values'], list) + assert isinstance(ds[qc_var_name].attrs['flag_assessments'], list) + assert isinstance(ds[qc_var_name].attrs['flag_meanings'], list) + assert len(ds[qc_var_name].attrs['flag_values']) >= 1 + assert len(ds[qc_var_name].attrs['flag_assessments']) >= 1 + assert len(ds[qc_var_name].attrs['flag_meanings']) >= 1 + assert ds[qc_var_name].attrs['flag_assessments'][0] == 'Not failing' + assert ( + ds[qc_var_name].attrs['flag_meanings'][0] == 'Not failing quality control tests' + ) + + for assessment in ds[qc_var_name].attrs['flag_assessments']: + assert assessment in expected_assessments + + if created_qc_summary: + assert "Quality control summary implemented by ACT" in ds.attrs['history'] + + del ds + + except Exception: + with open(output_file, "a") as myfile: + myfile.write(f"{file}\n")