ARM-DOE · zssherman · Jul 29, 2024 · Jul 16, 2024 · Jul 16, 2024 · Jul 16, 2024
@@ -0,0 +1,102 @@
+import numpy as np
+import datetime
+
+
+class QCSummary:
+    """
+    This is a Mixins class used to allow using qcfilter class that is already
+    registered to the Xarray dataset. All the methods in this class will be added
+    to the qcfilter class. Doing this to make the code spread across more files
+    so it is more manageable and readable.
+
+    """
+
+    def __init__(self, ds):
+        """initialize"""
+        self._ds = ds
+
+    def create_qc_summary(self, cleanup_qc=False):
+        """
+        Method to convert embedded quality control to summary QC that utilzes
+        flag values instead of flag masks and summarizes the assessments to only
+        a few states. Lowest level of quality control will be listed first with most
+        sever having higher integer numbers.
+
+        cleanup_qc : boolean
+            Call clean.cleanup() method to convert to standardized ancillary quality control
+            variables. The quality control summary requires the current embedded quality
+            control variables to use ACT standards.
+
+        Returns
+        -------
+        return_ds : xarray.Dataset
+            ACT Xarray dataset with quality control variales converted to summary flag values.
+
+        """
+
+        standard_assessments = [
+            'Suspect',
+            'Indeterminate',
+            'Incorrect',
+            'Bad',
+        ]
+        standard_meanings = [
+            "Data suspect, further analysis recommended",
+            "Data suspect, further analysis recommended",
+            "Data incorrect, use not recommended",
+            "Data incorrect, use not recommended",
+        ]
+
+        return_ds = self._ds.copy()
+
+        if cleanup_qc:
+            self._ds.clean.cleanup()
+
+        added = False
+        for var_name in list(self._ds.data_vars):
+            qc_var_name = self.check_for_ancillary_qc(var_name, add_if_missing=False, cleanup=False)
+
+            if qc_var_name is None:
+                continue
+
+            added = True
+
+            assessments = list(set(self._ds[qc_var_name].attrs['flag_assessments']))
+            del return_ds[qc_var_name]
+
+            return_ds.qcfilter.add_test(
+                var_name,
+                index=None,
+                test_number=0,
+                test_meaning='Passing all quality control tests',
+                test_assessment='Passing',
+                flag_value=True,
+            )
+
+            for ii, assessment in enumerate(standard_assessments):
+                if assessment not in assessments:
+                    continue
+
+                qc_ma = self.get_masked_data(var_name, rm_assessments=assessment)
+
+                # Do not really know how to handle scalars yet.
+                if len(qc_ma.mask.shape) == 0:
+                    continue
+
+                return_ds.qcfilter.add_test(
+                    var_name,
+                    index=np.where(qc_ma.mask),
+                    test_meaning=standard_meanings[ii],
+                    test_assessment=assessment,
+                    flag_value=True,
+                )
+
+        if added:
+            history = return_ds.attrs['history']
+            history += (
+                " ; Quality control summary implemented by ACT at "
+                f"{datetime.datetime.utcnow().isoformat()} UTC."
+            )
+            return_ds.attrs['history'] = history
+
+        return return_ds
@@ -9,11 +9,11 @@
 import numpy as np
 import xarray as xr
 
-from act.qc import comparison_tests, qctests, bsrn_tests
+from act.qc import comparison_tests, qctests, bsrn_tests, qc_summary
 
 
 @xr.register_dataset_accessor('qcfilter')
-class QCFilter(qctests.QCTests, comparison_tests.QCTests, bsrn_tests.QCTests):
+class QCFilter(qctests.QCTests, comparison_tests.QCTests, bsrn_tests.QCTests, qc_summary.QCSummary):
     """
     A class for building quality control variables containing arrays for
     filtering data based on a set of test condition typically based on the
@@ -539,7 +539,10 @@ def set_test(self, var_name, index=None, test_number=None, flag_value=False):
 
         if index is not None:
             if flag_value:
-                qc_variable[index] = test_number
+                if len(qc_variable.shape) == 0:
+                    qc_variable = test_number
+                else:
+                    qc_variable[index] = test_number
             else:
                 if bool(np.shape(index)):
                     qc_variable[index] = set_bit(qc_variable[index], test_number)
@@ -904,7 +907,14 @@ def get_masked_data(
 
         mask = np.zeros(variable.shape, dtype=bool)
         for test in test_numbers:
-            mask = mask | self._ds.qcfilter.get_qc_test_mask(var_name, test, flag_value=flag_value)
+            qc_test_mask = self._ds.qcfilter.get_qc_test_mask(var_name, test, flag_value=flag_value)
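+            # Some QC variables incorrectly have only a time dimension while the
+            # corresponding data variable is time-height. If that is the case,
+            # stretch the QC mask to the data variable's shape so the masks can be combined.
+            # NOTE(review): np.resize fills the new shape with repeated copies of the
+            # flattened mask; confirm this matches a per-time stretch along height.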
+            if variable.shape != qc_test_mask.shape:
+                qc_test_mask = np.resize(qc_test_mask, variable.shape)
+
+            mask = mask | qc_test_mask
 
         # Convert data numpy array into masked array
         try:

@@ -0,0 +1,19 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption("--runbig", action="store_true", default=False, help="Run big tests")
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "big: mark test as slow to run")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--runbig"):
+        # --runbig given in cli: do not skip big tests
+        return
+    skip_big = pytest.mark.skip(reason="need --runbig option to run")
+    for item in items:
+        if "big" in item.keywords:
+            item.add_marker(skip_big)
diff --git a/tests/qc/test_arm_qc.py b/tests/qc/test_arm_qc.py
@@ -12,7 +12,7 @@ def test_scalar_dqr():
     # DQR webservice does go down, so ensure it
     # properly runs first before testing
     try:
-        ds = add_dqr_to_qc(ds)
+        ds = add_dqr_to_qc(ds, assessment='Reprocessed,Suspect,Incorrect')
         ran = True
     except ValueError:
         ran = False

@@ -0,0 +1,208 @@
+import numpy as np
+from os import environ
+from pathlib import Path
+import random
+import pytest
+
+from act.io.arm import read_arm_netcdf
+from act.tests import EXAMPLE_MET1
+from act.qc.qcfilter import set_bit
+
+
+def test_qc_summary():
+    for cleanup in [False, True]:
+        ds = read_arm_netcdf(EXAMPLE_MET1, cleanup_qc=not cleanup)
+        for var_name in ['temp_mean', 'rh_mean']:
+            qc_var_name = f'qc_{var_name}'
+            qc_data = ds[qc_var_name].values
+
+            assert np.sum(qc_data) == 0
+
+            index_4 = np.arange(100, 200)
+            qc_data[index_4] = set_bit(qc_data[index_4], 4)
+            index_1 = np.arange(170, 230)
+            qc_data[index_1] = set_bit(qc_data[index_1], 1)
+            index_2 = np.arange(250, 400)
+            qc_data[index_2] = set_bit(qc_data[index_2], 2)
+            index_3 = np.arange(450, 510)
+            qc_data[index_3] = set_bit(qc_data[index_3], 3)
+            ds[qc_var_name].values = qc_data
+
+        result = ds.qcfilter.create_qc_summary(cleanup_qc=cleanup)
+
+        assert 'flag_masks' not in result[qc_var_name].attrs.keys()
+        assert isinstance(result[qc_var_name].attrs['flag_values'], list)
+
+        assert np.sum(result[qc_var_name].values) == 610
+
+        qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Indeterminate')
+        assert np.all(np.where(qc_ma.mask)[0] == np.arange(100, 170))
+
+        qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Bad')
+        index = np.concatenate([index_1, index_2, index_3])
+        assert np.all(np.where(qc_ma.mask)[0] == index)
+
+        assert "Quality control summary implemented by ACT" in result.attrs['history']
+
+
+def test_qc_summary_multiple_assessment_names():
+    ds = read_arm_netcdf(EXAMPLE_MET1, cleanup_qc=True)
+    var_name = 'temp_mean'
+    qc_var_name = f'qc_{var_name}'
+    qc_data = ds[qc_var_name].values
+
+    assert np.sum(qc_data) == 0
+
+    index_4 = np.arange(200, 300)
+    qc_data[index_4] = set_bit(qc_data[index_4], 4)
+    index_1 = np.arange(270, 330)
+    qc_data[index_1] = set_bit(qc_data[index_1], 1)
+    index_2 = np.arange(350, 500)
+    qc_data[index_2] = set_bit(qc_data[index_2], 2)
+    index_3 = np.arange(550, 610)
+    qc_data[index_3] = set_bit(qc_data[index_3], 3)
+    ds[qc_var_name].values = qc_data
+
+    index_5 = np.arange(50, 150)
+    ds.qcfilter.add_test(
+        var_name, index=index_5, test_meaning='Testing Suspect', test_assessment='Suspect'
+    )
+
+    index_6 = np.arange(130, 210)
+    ds.qcfilter.add_test(
+        var_name, index=index_6, test_meaning='Testing Incorrect', test_assessment='Incorrect'
+    )
+
+    result = ds.qcfilter.create_qc_summary()
+
+    assert result[qc_var_name].attrs['flag_assessments'] == [
+        'Passing',
+        'Suspect',
+        'Indeterminate',
+        'Incorrect',
+        'Bad',
+    ]
+
+    qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Indeterminate')
+    assert np.sum(np.where(qc_ma.mask)[0]) == 14370
+
+    qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Suspect')
+    assert np.sum(np.where(qc_ma.mask)[0]) == 7160
+
+    qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Bad')
+    assert np.sum(np.where(qc_ma.mask)[0]) == 116415
+
+    qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Incorrect')
+    assert np.sum(np.where(qc_ma.mask)[0]) == 13560
+
+    assert np.sum(np.where(result[qc_var_name].values == 0)) == 884575
+    qc_ma = result.qcfilter.get_masked_data(var_name, rm_assessments='Passing')
+    assert np.sum(np.where(qc_ma.mask)[0]) == 884575
+
+
+@pytest.mark.big
+@pytest.mark.skipif('ARCHIVE_DATA' not in environ, reason="Running outside ADC system.")
+def test_qc_summary_big_data():
+    """
+    We want to test on as much ARM data as possible. But we do not want to force
+    a large amount of test data in GitHub. Plan is to see if the pytest code is being
+    run on ARM system and if so then run on historical data. If running on GitHub
+    then don't run tests. Also, have a switch to not force this big test to always
+    run as that would be mean to the developer. So need to periodicaly run with the
+    manual switch enabled.
+
+    To Run this test set keyword on pytest command line:
+    --runbig
+
+    """
+
+    base_path = Path(environ['ARCHIVE_DATA'])
+    if not base_path.is_dir():
+        return
+
+    # Set number of files from each directory to test.
+    skip_sites = [
+        'shb',
+        'wbu',
+        'dna',
+        'rld',
+        'smt',
+        'nic',
+        'isp',
+        'dmf',
+        'nac',
+        'rev',
+        'yeu',
+        'zrh',
+        'osc',
+    ]
+    skip_datastream_codes = ['mmcrmom']
+    num_files = 1  # 3
+    testing_files = []
+    expected_assessments = ['Passing', 'Suspect', 'Indeterminate', 'Incorrect', 'Bad']
+
+    site_dirs = list(base_path.glob('???'))
+    for site_dir in site_dirs:
+        if site_dir.name in skip_sites:
+            continue
+
+        datastream_dirs = list(site_dir.glob('*.[bc]?'))
+        for datastream_dir in datastream_dirs:
+            skip = False
+            for character in ['A', 'X', 'U', 'F']:
+                if character in datastream_dir.name:
+                    skip = True
+                    break
+
+            for datastream_code in skip_datastream_codes:
+                if datastream_code in datastream_dir.name:
+                    skip = True
+                    break
+
+            if skip:
+                continue
+
+            files = list(datastream_dir.glob('*.nc'))
+            files.extend(datastream_dir.glob('*.cdf'))
+            if len(files) == 0:
+                continue
+
+            num_tests = num_files
+            if len(files) < num_files:
+                num_tests = len(files)
+
+            for ii in range(0, num_tests):
+                testing_files.append(random.choice(files))
+
+    for file in testing_files:
+        print(f"Testing: {file}")
+        ds = read_arm_netcdf(str(file), cleanup_qc=True)
+        ds = ds.qcfilter.create_qc_summary()
+
+        created_qc_summary = False
+        for var_name in ds.data_vars:
+            qc_var_name = ds.qcfilter.check_for_ancillary_qc(
+                var_name, add_if_missing=False, cleanup=False
+            )
+
+            if qc_var_name is None:
+                continue
+
+            created_qc_summary = True
+
+            assert isinstance(ds[qc_var_name].attrs['flag_values'], list)
+            assert isinstance(ds[qc_var_name].attrs['flag_assessments'], list)
+            assert isinstance(ds[qc_var_name].attrs['flag_meanings'], list)
+            assert len(ds[qc_var_name].attrs['flag_values']) >= 1
+            assert len(ds[qc_var_name].attrs['flag_assessments']) >= 1
+            assert len(ds[qc_var_name].attrs['flag_meanings']) >= 1
+            assert ds[qc_var_name].attrs['flag_assessments'][0] == 'Passing'
+            assert ds[qc_var_name].attrs['flag_meanings'][0] == 'Passing all quality control tests'
+
+            for assessment in ds[qc_var_name].attrs['flag_assessments']:
+                assert assessment in expected_assessments
+
+        if created_qc_summary:
+            assert "Quality control summary implemented by ACT" in ds.attrs['history']
+
+        del ds