Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make caching warn if some of the args are unhashable #2585

Merged
merged 7 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 26 additions & 19 deletions satpy/modifiers/angles.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,33 +138,40 @@

def __call__(self, *args, cache_dir: Optional[str] = None) -> Any:
"""Call the decorated function."""
new_args = self._sanitize_args_func(*args) if self._sanitize_args_func is not None else args
arg_hash = _hash_args(*new_args, unhashable_types=self._uncacheable_arg_types)
should_cache, cache_dir = self._get_should_cache_and_cache_dir(new_args, cache_dir)
sanitized_args = self._sanitize_args_func(*args) if self._sanitize_args_func is not None else args
should_cache, cache_dir = self._get_should_cache_and_cache_dir(sanitized_args, cache_dir)
if should_cache:
try:
arg_hash = _hash_args(*sanitized_args, unhashable_types=self._uncacheable_arg_types)
except TypeError as err:
warnings.warn("Cannot cache function because of unhashable argument: " + str(err), stacklevel=2)
should_cache = False

if not should_cache:
return self._func(*args)

zarr_fn = self._zarr_pattern(arg_hash)
zarr_format = os.path.join(cache_dir, zarr_fn)
zarr_paths = glob(zarr_format.format("*"))
if not should_cache or not zarr_paths:
# use sanitized arguments if we are caching, otherwise use original arguments
args_to_use = new_args if should_cache else args

if not zarr_paths:
# use sanitized arguments
args_to_use = sanitized_args
res = self._func(*args_to_use)
if should_cache and not zarr_paths:
self._warn_if_irregular_input_chunks(args, args_to_use)
self._cache_results(res, zarr_format)
self._warn_if_irregular_input_chunks(args, args_to_use)
self._cache_results(res, zarr_format)

# if we did any caching, let's load from the zarr files
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@djhoese while refactoring this function a bit, I got confused by this: Why do we load from the zarr files just after we cached it to disk?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. So that we're caching immediately and using those cached values.
  2. So that future calls to this function that result in the same cached results will use the same cached values and when .compute() is finally called there is no redundant computation. By that I mean if we generate these angles once but then continue to use the dask array and then a different product generates these angles and detects the cached value and uses the cached arrays, then when we finally do dask.array.compute(composite1, composite2) we would get one set of computations based on the original dask array and one set of computations based on the cached array even though they should be the same numbers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah, ok, good. I might improve the comment to explain point 2, for future us.

if should_cache:
# re-calculate the cached paths
zarr_paths = sorted(glob(zarr_format.format("*")))
if not zarr_paths:
raise RuntimeError("Data was cached to disk but no files were found")
new_chunks = _get_output_chunks_from_func_arguments(args)
res = tuple(da.from_zarr(zarr_path, chunks=new_chunks) for zarr_path in zarr_paths)
# re-calculate the cached paths
zarr_paths = sorted(glob(zarr_format.format("*")))
if not zarr_paths:
raise RuntimeError("Data was cached to disk but no files were found")

Check warning on line 168 in satpy/modifiers/angles.py

View check run for this annotation

Codecov / codecov/patch

satpy/modifiers/angles.py#L168

Added line #L168 was not covered by tests
new_chunks = _get_output_chunks_from_func_arguments(args)
res = tuple(da.from_zarr(zarr_path, chunks=new_chunks) for zarr_path in zarr_paths)

Check notice on line 170 in satpy/modifiers/angles.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

✅ No longer an issue: Complex Method

ZarrCacheHelper.__call__ is no longer above the threshold for cyclomatic complexity

Check notice on line 170 in satpy/modifiers/angles.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

✅ No longer an issue: Bumpy Road Ahead

ZarrCacheHelper.__call__ is no longer above the threshold for logical blocks with deeply nested code
return res

def _get_should_cache_and_cache_dir(self, args, cache_dir: Optional[str]) -> tuple[bool, str]:
should_cache: bool = satpy.config.get(self._cache_config_key, False)
can_cache = not any(isinstance(arg, self._uncacheable_arg_types) for arg in args)
should_cache = should_cache and can_cache
cache_dir = self._get_cache_dir_from_config(cache_dir)
return should_cache, cache_dir

Expand Down Expand Up @@ -250,7 +257,7 @@
hashable_args = []
for arg in args:
if isinstance(arg, unhashable_types):
continue
raise TypeError(f"Unhashable type ({type(arg)}).")
if isinstance(arg, HASHABLE_GEOMETRIES):
arg = hash(arg)
elif isinstance(arg, datetime):
Expand Down
24 changes: 24 additions & 0 deletions satpy/tests/modifier_tests/test_angles.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,30 @@
satpy.config.set(cache_lonlats=True, cache_dir=str(tmp_path)):
_fake_func((5, 5), ((5,), (5,)))

def test_caching_with_array_in_args_warns(self, tmp_path):
"""Test that trying to cache with non-dask arrays fails."""
from satpy.modifiers.angles import cache_to_zarr_if

@cache_to_zarr_if("cache_lonlats")
def _fake_func(array):
return array + 1

with pytest.warns(UserWarning), \
satpy.config.set(cache_lonlats=True, cache_dir=str(tmp_path)):
_fake_func(da.zeros(100))

Check warning on line 335 in satpy/tests/modifier_tests/test_angles.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

❌ New issue: Code Duplication

The module contains 4 functions with similar structure: TestAngleGeneration.test_cached_no_chunks_fails,TestAngleGeneration.test_cached_result_numpy_fails,TestAngleGeneration.test_caching_with_array_in_args_does_not_warn_when_caching_is_not_enabled,TestAngleGeneration.test_caching_with_array_in_args_warns. Avoid duplicated, aka copy-pasted, code inside the module. More duplication lowers the code health.

def test_caching_with_array_in_args_does_not_warn_when_caching_is_not_enabled(self, tmp_path, recwarn):
"""Test that trying to cache with non-dask arrays fails."""
from satpy.modifiers.angles import cache_to_zarr_if

@cache_to_zarr_if("cache_lonlats")
def _fake_func(array):
return array + 1

with satpy.config.set(cache_lonlats=False, cache_dir=str(tmp_path)):
_fake_func(da.zeros(100))
assert len(recwarn) == 0

def test_no_cache_dir_fails(self, tmp_path):
"""Test that 'cache_dir' not being set fails."""
from satpy.modifiers.angles import _get_sensor_angles_from_sat_pos, get_angles
Expand Down
Loading