Updates to making QC Summary files #852

Merged
32 commits from fix_writing_files merged into main on Sep 4, 2024
Commits
5b1fed4
Not creating QC variable for time variables.
kenkehoe Aug 8, 2024
74892bd
Updates to make writing more CF compliant.
kenkehoe Aug 8, 2024
187f28f
Merge branch 'main' into fix_writing_files
kenkehoe Aug 8, 2024
e53eb1e
Updated how to handle _FillValue. Changed all Xarray Datasets to ds. …
kenkehoe Aug 9, 2024
b50fd21
Adding option to suppress adding QC variables. Checking if time is num…
kenkehoe Aug 9, 2024
85516b2
Adding option to remove QC variable attributes. Updated method to add…
kenkehoe Aug 9, 2024
ba46940
Adding a method to ensure datatype is datetime64.
kenkehoe Aug 10, 2024
a44ad88
Returning order to correct format for making copy. Returning .update(…
kenkehoe Aug 10, 2024
d870cae
Changing the default from Internal QC Assessment terms to DQR Assessm…
kenkehoe Aug 23, 2024
2229dbe
Adding option to normalize assessment terms used.
kenkehoe Aug 23, 2024
2efa139
Adding option to set the missing value indicator to be a value other …
kenkehoe Aug 23, 2024
82a3b57
Changed to use Suspect and Incorrect
kenkehoe Aug 23, 2024
f831cbe
Updated to match new default values for normalized assessments
kenkehoe Aug 23, 2024
f6121d9
Removing commented code.
kenkehoe Aug 23, 2024
29a3359
Improving the datafilter test. Checking ancillary_variables attribute…
kenkehoe Aug 27, 2024
b18dd53
Adding more testing to qc_summary.
kenkehoe Aug 28, 2024
f5119e6
Updated to handle flag_assessments outside the standard 4. Correctly s…
kenkehoe Aug 28, 2024
e06191c
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
440f821
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
01d4b0e
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
778227c
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
b8e3c4d
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
4b2f0eb
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
c0f2423
Catching warning with pytest to ensure the warning was issued
kenkehoe Aug 28, 2024
99e8524
DOC: Fix spelling
zssherman Aug 30, 2024
4bba281
DOC: Fix spelling
zssherman Aug 30, 2024
2d7dc76
DOC: Fix spelling
zssherman Aug 30, 2024
6ccc537
MNT: Remove old code
zssherman Aug 30, 2024
05541fc
MNT: Remove old code
zssherman Aug 30, 2024
4681905
MNT: Remove old code
zssherman Aug 30, 2024
ac5967c
MNT: Remove old code
zssherman Aug 30, 2024
8bdff06
MNT: Remove old code
zssherman Aug 30, 2024
151 changes: 77 additions & 74 deletions act/io/arm.py
@@ -549,8 +549,9 @@ def write_netcdf(
         make_copy=True,
         cf_compliant=False,
         delete_global_attrs=['qc_standards_version', 'qc_method', 'qc_comment'],
-        FillValue=-9999,
+        FillValue=True,
         cf_convention='CF-1.8',
+        encoding={},
         **kwargs,
     ):
         """
@@ -573,7 +574,8 @@
             white space between words.
         join_char : str
             The character sting to use for replacing white spaces between words when converting
-            a list of strings to single character string attributes.
+            a list of strings to single character string attributes. Main use is with the
+            flag_meanings attribute.
         make_copy : boolean
             Make a copy before modifying Dataset to write. For large Datasets this
             may add processing time and memory. If modifying the Dataset is OK
@@ -587,14 +589,18 @@
             Optional global attributes to be deleted. Defaults to some standard
             QC attributes that are not needed. Can add more or set to None to not
             remove the attributes.
-        FillValue : int, float
-            The value to use as a _FillValue in output file. This is used to fix
-            issues with how Xarray handles missing_value upon reading. It's confusing
-            so not a perfect fix. Set to None to leave Xarray to do what it wants.
-            Set to a value to be the value used as _FillValue in the file and data
-            array. This should then remove missing_value attribute from the file as well.
+        FillValue : boolean
+            Xarray assumes all float type variables had the missing value indicator converted
+            to NaN upon reading. to_netcdf() will then write a _FillValue attribute set to NaN.
+            Set FillValue to False to supress adding the _FillValue=NaN variable attribute to
+            the written file. Set to True to allow to_netcdf() to add the attribute.
+            If the Dataset variable already has a _FillValue attribute or a _FillValue key
+            is provided in the encoding dictionary those will not be changed and a _FillValue
+            will be written to NetCDF file.
         cf_convention : str
             The Climate and Forecast convention string to add to Conventions attribute.
+        encoding : dict
+            The encoding dictionary used with to_netcdf() method.
         **kwargs : keywords
            Keywords to pass through to Dataset.to_netcdf()

@@ -607,114 +613,118 @@
         """

         if make_copy:
-            write_ds = copy.deepcopy(self._ds)
+            ds = copy.deepcopy(self._ds)
         else:
-            write_ds = self._ds
+            ds = self._ds

-        encoding = {}
         if cleanup_global_atts:
-            for attr in list(write_ds.attrs):
+            for attr in list(ds.attrs):
                 if attr.startswith('_'):
-                    del write_ds.attrs[attr]
+                    del ds.attrs[attr]

         if cleanup_qc_atts:
             check_atts = ['flag_meanings', 'flag_assessments']
-            for var_name in list(write_ds.data_vars):
-                if 'standard_name' not in write_ds[var_name].attrs.keys():
+            for var_name in list(ds.data_vars):
+                if 'standard_name' not in ds[var_name].attrs.keys():
                     continue

+                if ds[var_name].attrs['standard_name'] != "quality_flag":
+                    continue
+
                 for attr_name in check_atts:
                     try:
-                        att_values = write_ds[var_name].attrs[attr_name]
+                        att_values = ds[var_name].attrs[attr_name]
                         if isinstance(att_values, (list, tuple)):
                             att_values = [
                                 att_value.replace(' ', join_char) for att_value in att_values
                             ]
-                        write_ds[var_name].attrs[attr_name] = ' '.join(att_values)
+                        ds[var_name].attrs[attr_name] = ' '.join(att_values)

                     except KeyError:
                         pass

-                # Tell .to_netcdf() to not add a _FillValue attribute for
-                # quality control variables.
-                if FillValue is not None:
-                    encoding[var_name] = {'_FillValue': None}
+        # Xarray makes an assumption that float type variables were read in and converted
+        # missing value indicator to NaN. .to_netcdf() will then automatically assign
+        # _FillValue attribute set to NaN when writing. If requested will set _FillValue
+        # key in encoding to None which will supress to_netcdf() from adding a _FillValue.
+        # If _FillValue attribute or _FillValue key in encoding is already set, will not
+        # override and the _FillValue will be written to the file.
+        if not FillValue:
+            all_var_names = list(ds.coords.keys()) + list(ds.data_vars)
+            for var_name in all_var_names:
+                if '_FillValue' in ds[var_name].attrs:
+                    continue

-        # Clean up _FillValue vs missing_value mess by creating an
-        # encoding dictionary with each variable's _FillValue set to
-        # requested fill value. May need to improve upon this for data type
-        # and other issues in the future.
-        if FillValue is not None:
-            skip_variables = ['base_time', 'time_offset', 'qc_time'] + list(encoding.keys())
-            for var_name in list(write_ds.data_vars):
-                if var_name not in skip_variables:
-                    encoding[var_name] = {'_FillValue': FillValue}
+                if var_name not in encoding.keys():
+                    encoding[var_name] = {'_FillValue': None}
+                elif '_FillValue' not in encoding[var_name].keys():
+                    encoding[var_name]['_FillValue'] = None

         if delete_global_attrs is not None:
             for attr in delete_global_attrs:
                 try:
-                    del write_ds.attrs[attr]
+                    del ds.attrs[attr]
                 except KeyError:
                     pass

-        for var_name in list(write_ds.keys()):
-            if 'string' in list(write_ds[var_name].attrs.keys()):
-                att = write_ds[var_name].attrs['string']
-                write_ds[var_name].attrs[var_name + '_string'] = att
-                del write_ds[var_name].attrs['string']
+        for var_name in list(ds.keys()):
+            if 'string' in list(ds[var_name].attrs.keys()):
+                att = ds[var_name].attrs['string']
+                ds[var_name].attrs[var_name + '_string'] = att
+                del ds[var_name].attrs['string']

         # If requested update global attributes and variables attributes for required
         # CF attributes.
         if cf_compliant:
             # Get variable names and standard name for each variable
-            var_names = list(write_ds.keys())
+            var_names = list(ds.keys())
             standard_names = []
             for var_name in var_names:
                 try:
-                    standard_names.append(write_ds[var_name].attrs['standard_name'])
+                    standard_names.append(ds[var_name].attrs['standard_name'])
                 except KeyError:
                     standard_names.append(None)

             # Check if time varible has axis and standard_name attribute
             coord_name = 'time'
             try:
-                write_ds[coord_name].attrs['axis']
+                ds[coord_name].attrs['axis']
             except KeyError:
                 try:
-                    write_ds[coord_name].attrs['axis'] = 'T'
+                    ds[coord_name].attrs['axis'] = 'T'
                 except KeyError:
                     pass

             try:
-                write_ds[coord_name].attrs['standard_name']
+                ds[coord_name].attrs['standard_name']
             except KeyError:
                 try:
-                    write_ds[coord_name].attrs['standard_name'] = 'time'
+                    ds[coord_name].attrs['standard_name'] = 'time'
                 except KeyError:
                     pass

             # Try to determine type of dataset by coordinate dimention named time
             # and other factors
             try:
-                write_ds.attrs['FeatureType']
+                ds.attrs['FeatureType']
             except KeyError:
-                dim_names = list(write_ds.dims)
+                dim_names = list(ds.dims)
                 FeatureType = None
                 if dim_names == ['time']:
                     FeatureType = 'timeSeries'
                 elif len(dim_names) == 2 and 'time' in dim_names and 'bound' in dim_names:
                     FeatureType = 'timeSeries'
                 elif len(dim_names) >= 2 and 'time' in dim_names:
                     for var_name in var_names:
-                        dims = list(write_ds[var_name].dims)
+                        dims = list(ds[var_name].dims)
                         if len(dims) == 2 and 'time' in dims:
                             prof_dim = list(set(dims) - {'time'})[0]
-                            if write_ds[prof_dim].values.size > 2:
+                            if ds[prof_dim].values.size > 2:
                                 FeatureType = 'timeSeriesProfile'
                                 break

                 if FeatureType is not None:
-                    write_ds.attrs['FeatureType'] = FeatureType
+                    ds.attrs['FeatureType'] = FeatureType

             # Add axis and positive attributes to variables with standard_name
             # equal to 'altitude'
@@ -723,18 +733,18 @@
             ]
             for var_name in alt_variables:
                 try:
-                    write_ds[var_name].attrs['axis']
+                    ds[var_name].attrs['axis']
                 except KeyError:
-                    write_ds[var_name].attrs['axis'] = 'Z'
+                    ds[var_name].attrs['axis'] = 'Z'

                 try:
-                    write_ds[var_name].attrs['positive']
+                    ds[var_name].attrs['positive']
                 except KeyError:
-                    write_ds[var_name].attrs['positive'] = 'up'
+                    ds[var_name].attrs['positive'] = 'up'

             # Check if the Conventions global attribute lists the CF convention
             try:
-                Conventions = write_ds.attrs['Conventions']
+                Conventions = ds.attrs['Conventions']
                 Conventions = Conventions.split()
                 cf_listed = False
                 for ii in Conventions:
@@ -743,37 +753,30 @@
                         break
                 if not cf_listed:
                     Conventions.append(cf_convention)
-                    write_ds.attrs['Conventions'] = ' '.join(Conventions)
+                    ds.attrs['Conventions'] = ' '.join(Conventions)

             except KeyError:
-                write_ds.attrs['Conventions'] = str(cf_convention)
+                ds.attrs['Conventions'] = str(cf_convention)

             # Reorder global attributes to ensure history is last
             try:
-                history = copy.copy(write_ds.attrs['history'])
-                del write_ds.attrs['history']
-                write_ds.attrs['history'] = history
+                history = copy.copy(ds.attrs['history'])
+                del ds.attrs['history']
+                ds.attrs['history'] = history
             except KeyError:
                 pass
-        current_time = dt.datetime.now().replace(microsecond=0)
-        if 'history' in list(write_ds.attrs.keys()):
-            write_ds.attrs['history'] += ''.join(
-                [
-                    '\n',
-                    str(current_time),
-                    ' created by ACT ',
-                    str(act.__version__),
-                    ' act.io.write.write_netcdf',
-                ]
-            )
-
-        if 'time_bounds' in encoding.keys():
-            encoding['time_bounds']['dtype'] = 'float64'
-
-        if hasattr(write_ds, 'time_bounds') and not write_ds.time.encoding:
-            write_ds.time.encoding.update(write_ds.time_bounds.encoding)
+        current_time = dt.datetime.utcnow().replace(microsecond=0)
+        history_value = (
+            f'Written to file by ACT-{act.__version__} '
+            f'with write_netcdf() at {current_time} UTC'
+        )
+        if 'history' in list(ds.attrs.keys()):
+            ds.attrs['history'] += f" ; {history_value}"
+        else:
+            ds.attrs['history'] = history_value

-        write_ds.to_netcdf(encoding=encoding, **kwargs)
+        ds.to_netcdf(encoding=encoding, **kwargs)


 def check_if_tar_gz_file(filenames):
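A minimal usage sketch of the reworked FillValue and encoding keywords, for reviewers trying the change. The file and variable names below are hypothetical placeholders, not part of this PR.

import act

# Hypothetical ARM file; float variables with a missing value indicator are
# converted to NaN on read, which is the assumption FillValue works around.
ds = act.io.arm.read_arm_netcdf('sgpmetE13.b1.20190101.000000.cdf')

# Default FillValue=True: to_netcdf() is allowed to write _FillValue=NaN
# on float variables, matching Xarray's behavior after reading.
ds.write.write_netcdf(path='with_fillvalue.nc')

# FillValue=False: suppress the automatic _FillValue=NaN on variables that
# do not already define one, while passing an explicit per-variable
# encoding straight through to to_netcdf().
ds.write.write_netcdf(
    path='no_fillvalue.nc',
    FillValue=False,
    encoding={'temp_mean': {'dtype': 'float32'}},  # hypothetical variable name
)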
48 changes: 45 additions & 3 deletions act/qc/arm.py
@@ -8,6 +8,7 @@
 import numpy as np
 import requests
 import json
+from dateutil import parser

 from act.config import DEFAULT_DATASTREAM_NAME

@@ -22,6 +23,7 @@ def add_dqr_to_qc(
     cleanup_qc=True,
     dqr_link=False,
     skip_location_vars=False,
+    create_missing_qc_variables=True,
 ):
     """
     Function to query the ARM DQR web service for reports and
@@ -68,6 +70,9 @@
     skip_location_vars : boolean
         Does not apply DQRs to location variables. This can be useful in the event
         the submitter has erroneously selected all variables.
+    create_missing_qc_variables : boolean
+        If a quality control variable for the data variable does not exist,
+        create the quality control varible and apply DQR.

     Returns
     -------
Expand Down Expand Up @@ -102,8 +107,35 @@ def add_dqr_to_qc(
if cleanup_qc:
ds.clean.cleanup()

start_date = ds['time'].values[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
end_date = ds['time'].values[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
# Get time from Dataset
time = ds['time'].values

# If the time is not a datetime64 because the read routine was not asked to
# convert CF variables, convert the time variable for this routine only.
if not np.issubdtype(time.dtype, np.datetime64):
units_strings = [
'seconds since ',
'minutes since ',
'hours since ',
'days since ',
'milliseconds since ',
'months since ',
'years since ',
]
td64_strings = ['s', 'm', 'h', 'D', 'ms', 'M', 'Y']
units = ds['time'].attrs['units']
for ii, _ in enumerate(units_strings):
if units.startswith(units_strings[ii]):
units = units.replace(units_strings[ii], '')
td64_string = td64_strings[ii]
break

start_time = parser.parse(units)
start_time = np.datetime64(start_time, td64_string)
time = start_time + ds['time'].values.astype('timedelta64[s]')

start_date = time[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
end_date = time[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')

# Clean up assessment to ensure it is a string with no spaces.
if isinstance(assessment, (list, tuple)):
@@ -152,7 +184,7 @@
             for time_range in docs[quality_category][dqr_number]['dates']:
                 starttime = np.datetime64(time_range['start_date'])
                 endtime = np.datetime64(time_range['end_date'])
-                ind = np.where((ds['time'].values >= starttime) & (ds['time'].values <= endtime))
+                ind = np.where((time >= starttime) & (time <= endtime))
                 if ind[0].size > 0:
                     index = np.append(index, ind[0])

@@ -181,6 +213,10 @@
         if skip_location_vars and var_name in loc_vars:
             continue

+        # Do not process time variables
+        if var_name in ['time', 'time_offset', 'time_bounds']:
+            continue
+
         # Only process provided variable names
         if variable is not None and var_name not in variable:
             continue
@@ -193,6 +229,12 @@
         except KeyError:
             pass

+        if (
+            create_missing_qc_variables is False
+            and ds.qcfilter.check_for_ancillary_qc(var_name, add_if_missing=False) is None
+        ):
+            continue
+
         try:
             ds.qcfilter.add_test(
                 var_name,
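A short sketch of the new keyword in use (the file name is a hypothetical placeholder): with create_missing_qc_variables=False, DQRs are applied only to variables that already carry an ancillary qc_ variable, and the time handling above means the Dataset no longer has to be read with CF time decoding enabled.

import act

# Works whether 'time' was decoded to datetime64 on read or left as raw
# numeric values with a CF units attribute -- the routine now converts
# numeric time internally before querying the DQR web service.
ds = act.io.arm.read_arm_netcdf('sgpmetE13.b1.20190101.000000.cdf')

# Apply DQRs only where a qc_ variable already exists; create no new ones.
ds = act.qc.arm.add_dqr_to_qc(ds, create_missing_qc_variables=False)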
2 changes: 1 addition & 1 deletion act/qc/clean.py
@@ -792,7 +792,7 @@ def normalize_assessment(
         self,
         variables=None,
         exclude_variables=None,
-        qc_lookup={'Incorrect': 'Bad', 'Suspect': 'Indeterminate'},
+        qc_lookup={'Bad': 'Incorrect', 'Indeterminate': 'Suspect'},
     ):
         """
         Method to clean up assessment terms used to be consistent between
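This inverts the default lookup, so cleaned-up ARM assessments are now normalized to the DQR terms rather than the other way around. A quick sketch of the effect (placeholder file name):

import act

ds = act.io.arm.read_arm_netcdf('sgpmetE13.b1.20190101.000000.cdf')
ds.clean.cleanup()

# With the new default qc_lookup, flag_assessments values of 'Bad' become
# 'Incorrect' and 'Indeterminate' become 'Suspect' on all QC variables.
ds.clean.normalize_assessment()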