Add option to fit data to a time varying GEV (#52)

* Update gev_fit to include non-stationary distribution parameters * Update calc_moments and return_curve for compatability with fit_gev * Fix bug in rx5day calculation * Add missing fileio.py package dependencies * Test only latest python * Import all modules when testing * Fix deliberate bugs * Add missing package * Add more missing packages * Add another missing package * Fix linting errors and ignore W503 --------- Co-authored-by: Damien Irving <[email protected]>
AusClimateService · Oct 16, 2023 · 28add95 · 28add95
1 parent 89fdaa0
commit 28add95
Show file tree

Hide file tree

Showing 16 changed files with 1,373 additions and 1,332 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -9,7 +9,6 @@ jobs:
       fail-fast: false
       matrix:
         os: ["ubuntu-latest", "windows-latest", "macos-latest"]
-        python-version: ["3.9", "3.10", "3.11"]
 
     steps:
       - name: Checkout source
@@ -19,8 +18,7 @@ jobs:
         uses: conda-incubator/setup-miniconda@v2
         with:
           miniconda-version: "latest"
-          python-version: ${{ matrix.python-version }}
-          environment-file: ci/environment-${{ matrix.python-version }}.yml
+          environment-file: ci/environment.yml
           activate-environment: unseen-test
           auto-activate-base: false
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,4 +18,4 @@ repos:
         # these are errors that will be ignored by flake8
         # check out their meaning here
         # https://flake8.pycqa.org/en/latest/user/error-codes.html
-        - "--ignore=E203,C901"
+        - "--ignore=E203,C901,W503"
diff --git a/ci/environment-3.11.yml b/ci/environment-3.11.yml
diff --git a/ci/environment-3.9.yml b/ci/environment-3.9.yml
diff --git a/ci/environment-3.10.yml → ci/environment.yml b/ci/environment-3.10.yml → ci/environment.yml
@@ -2,15 +2,24 @@ name: unseen-test
 channels:
   - conda-forge
 dependencies:
-  - python=3.10
+  - python
   - geopandas
   - regionmask
   - xarray
   - dask-core
   - numpy
   - pytest
   - cftime
+  - gitpython
+  - cmdline_provenance
+  - xclim
+  - xskillscore
+  - seaborn
+  - zarr
+  - netcdf4
+  - dask-jobqueue
   - pip
   - pip:
     - codecov
     - pytest-cov
+    - xstatstests
diff --git a/docs/user_guide/distribution.png b/docs/user_guide/distribution.png
diff --git a/docs/user_guide/independence.png b/docs/user_guide/independence.png
diff --git a/docs/user_guide/moments.png b/docs/user_guide/moments.png
diff --git a/docs/user_guide/return_curve.png b/docs/user_guide/return_curve.png
diff --git a/docs/user_guide/stability.png b/docs/user_guide/stability.png
diff --git a/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb b/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb
diff --git a/docs/user_guide/worked_example.rst b/docs/user_guide/worked_example.rst
@@ -272,7 +272,7 @@ you could also run it at the command line and submit to the job queue.
 
 .. code-block:: none
 
-    $ fileio HadGEM3-GC31-MM_dcppA-hindcast_pr_files.txt Rx5day_HadGEM3-GC31-MM_dcppA-hindcast_s1960-2018_gn_hobart.zarr.zip --n_ensemble_files 10 --variables pr --time_freq A-DEC --time_agg max --input_freq D --point_selection -42.9 147.3 --reset_times --complete_time_agg_periods --units pr=mm day-1 --forecast -v --n_time_files 12
+    $ fileio HadGEM3-GC31-MM_dcppA-hindcast_pr_files.txt Rx5day_HadGEM3-GC31-MM_dcppA-hindcast_s1960-2018_gn_hobart.zarr.zip --n_ensemble_files 10 --variables pr --rolling_sum_window 5 --time_freq A-DEC --time_agg max --input_freq D --point_selection -42.9 147.3 --reset_times --complete_time_agg_periods --units pr=mm day-1 --forecast -v --n_time_files 12
 
 
 Stability and stationarity testing
@@ -295,7 +295,7 @@ To do this, we can use the ``stability`` module:
         uncertainty=True,
         return_method='gev',
         units='Rx5day (mm)',
-        ylim=(0, 250),
+        ylim=(0, 450),
     )
 
 
@@ -469,9 +469,9 @@ we can use the ``similarity`` module:
 
 .. code-block:: none
 
-    KS score: 0.6598098
-    KS p-value: 0.0
-    AD score: 235.39091
+    KS score: 0.2797175
+    KS p-value: 7.914835e-09
+    AD score: 29.255798
     AD p-value: 0.001
 
 
@@ -486,9 +486,9 @@ we can use the ``similarity`` module:
 
 .. code-block:: none
 
-    KS score: 0.0795494
-    KS p-value: 0.40908137
-    AD score: -0.29753023
+    KS score: 0.07429516
+    KS p-value: 0.49535686
+    AD score: -0.15807883
     AD p-value: 0.25
 
 
@@ -516,8 +516,8 @@ Once we've stacked our model data so it's one dimensional,
 .. code-block:: none
 
     <xarray.DataArray 'pr' (sample: 5900)>
-    array([ 84.050224,  81.98476 ,  46.54293 , ...,  68.81363 , 129.38924 ,
-            28.640915], dtype=float32)
+    array([124.84209 ,  73.138466,  62.510113, ...,  57.245464, 121.23235 ,
+            34.655647], dtype=float32)
     Coordinates:
         time       (sample) object 1961-11-01 12:00:00 ... 2028-11-01 12:00:00
       * sample     (sample) object MultiIndex
@@ -549,8 +549,8 @@ Once we've stacked our model data so it's one dimensional,
 
 .. code-block:: none
 
-    220 year return period
-    95% CI: 177-280 years
+    235 year return period
+    95% CI: 188-304 years
 
 
 .. image:: return_curve.png

diff --git a/unseen/general_utils.py b/unseen/general_utils.py
@@ -4,8 +4,11 @@
 import re
 
 import numpy as np
+from scipy.optimize import minimize
+from scipy.stats import genextreme, rv_continuous
+from scipy.stats._constants import _LOGXMAX
+from scipy.stats._distn_infrastructure import _sum_finite
 import xclim
-from scipy.stats import genextreme as gev
 
 
 class store_dict(argparse.Action):
@@ -138,45 +141,161 @@ def event_in_context(data, threshold, direction):
     return n_events, n_population, return_period, percentile
 
 
-def fit_gev(data, user_estimates=[], generate_estimates=False):
-    """Fit a GEV by providing fit and scale estimates.
+class ns_genextreme_gen(rv_continuous):
+    """Extreme value distributions (stationary or non-stationary)."""
 
-    Parameters
-    ----------
-    data : numpy ndarray
-    user_estimates : list, optional
-        Estimate of the location and scale parameters
-    generate_estimates : bool, default False
-        Fit GEV to data subset first to estimate parameters (useful for large datasets)
+    def fit_stationary(self, data, user_estimates, generate_estimates):
+        """Return estimates of shape, location and scale parameters using genextreme."""
+        if user_estimates:
+            shape, loc, scale = user_estimates
+            shape, loc, scale = genextreme.fit(data, shape, loc=loc, scale=scale)
 
-    Returns
-    -------
-    shape : float
-        Shape parameter
-    loc : float
-        Location parameter
-    scale : float
-        Scale parameter
-    """
+        elif generate_estimates:
+            # Generate initial estimates using a data subset (useful for large datasets).
+            shape, loc, scale = genextreme.fit(data[::2])
+            shape, loc, scale = genextreme.fit(data, shape, loc=loc, scale=scale)
+        else:
+            shape, loc, scale = genextreme.fit(data)
+        return shape, loc, scale
+
+    def nllf(self, theta, data, times):
+        """Penalised negative log-likelihood of GEV probability density function.
+
+        A modified version of scipy.stats.genextremes.fit for fitting extreme value
+        distribution parameters, in which the location and scale parameters can vary
+        linearly with a covariate. The non-stationary parameters will be returned only
+        if the input 'theta' incudes the time-varying location and scale parameters.
+        A large, finite penalty (rather than infinite negative log-likelihood)
+        is applied for observations beyond the support of the distribution.
+
+        Parameters
+        ----------
+        theta : tuple of floats
+            Shape, location and scale parameters (shape, loc0, loc1, scale0, scale1).
+        data, times : array_like
+            Data time series and indexes of covariates.
+
+        Returns
+        -------
+        total : float
+            The sum of the penalised negative likelihood function.
+        """
+        if len(theta) == 5:
+            # Non-stationary GEV parameters.
+            shape, loc0, loc1, scale0, scale1 = theta
+            loc = loc0 + loc1 * times
+            scale = scale0 + scale1 * times
 
-    if user_estimates:
-        loc_estimate, scale_estimate = user_estimates
-        shape, loc, scale = gev.fit(data, loc=loc_estimate, scale=scale_estimate)
-    elif generate_estimates:
-        shape_estimate, loc_estimate, scale_estimate = gev.fit(data[::2])
-        shape, loc, scale = gev.fit(data, loc=loc_estimate, scale=scale_estimate)
-    else:
-        shape, loc, scale = gev.fit(data)
+        else:
+            # Stationary GEV parameters.
+            shape, loc, scale = theta
 
-    return shape, loc, scale
+        s = (data - loc) / scale
 
+        # Calculate the NLLF (type 1 or types 2-3 extreme value distributions).
+        if shape == 0:
+            f = np.log(scale) + s + np.exp(-s)
 
-def return_period(data, event):
-    """Get return period for given event by fitting a GEV"""
+        else:
+            Z = 1 + shape * s
+            # NLLF at points where the data is supported by the distribution parameters.
+            # (N.B. the NLLF is not finite when the shape is nonzero and Z is negative
+            # because the PDF is zero (log(0)=inf) outside of these bounds).
+            f = np.where(
+                Z > 0,
+                np.log(scale)
+                + (1 + 1 / shape) * np.ma.log(Z)
+                + np.ma.power(Z, -1 / shape),
+                np.inf,
+            )
+
+        f = np.where(scale > 0, f, np.inf)  # Scale parameter must be positive.
+
+        # Sum function along all axes (where finite) & count infinite elements.
+        total, n_bad = _sum_finite(f)
+
+        # Add large finite penalty instead of infinity (log of the largest useable float).
+        total = total + n_bad * _LOGXMAX * 100
+        return total
+
+    def fit(
+        self,
+        data,
+        user_estimates=[],
+        loc1=0,
+        scale1=0,
+        generate_estimates=False,
+        stationary=True,
+        method="Nelder-Mead",
+    ):
+        """Return estimates of data distribution parameters and their trend (if applicable).
+
+        For stationary data, estimates the shape, location and scale parameters using
+        scipy.stats.genextremes.fit(). For non-stationary data, also estimates the linear
+        location and scale trend parameters using a penalised negative log-likelihood
+        function with initial estimates based on the stationary fit.
+
+        Parameters
+        ----------
+        data : array_like
+            Data timeseries.
+        user estimates: list, optional
+            Initial estimates of the shape, loc and scale parameters.
+        loc1, scale1 : float, optional
+            Initial estimates of the location and scale trend parameters. Defaults to 0.
+        stationary : bool, optional
+            Fit as a stationary GEV using scipy.stats.genextremes.fit. Defaults to True.
+        method : str, optional
+            Method used for scipy.optimize.minimize. Defaults to 'Nelder-Mead'.
+
+        Returns
+        -------
+        theta : tuple of floats
+            Shape, location and scale parameters (and loc1 and scale1 if applicable)
+
+        Example
+        -------
+        '''
+        ns_genextreme = ns_genextreme_gen()
+        data = scipy.stats.genextreme.rvs(0.8, loc=3.2, scale=0.5, size=500, random_state=0)
+        shape, loc, scale = ns_genextreme.fit(data, stationary=True)
+        shape, loc, loc1, scale, scale1 = ns_genextreme.fit(data, stationary=False)
+        '''
+        """
+
+        # Use genextremes to get stationary distribution parameters.
+        shape, loc, scale = self.fit_stationary(
+            data, user_estimates, generate_estimates
+        )
+
+        if stationary:
+            theta = shape, loc, scale
+        else:
+            times = np.arange(data.shape[-1], dtype=int)
+            theta_i = shape, loc, loc1, scale, scale1
+
+            # Optimisation bounds (scale parameter must be non-negative).
+            bounds = [(None, None), (None, None), (None, None), (0, None), (None, None)]
+
+            # Minimise the negative log-likelihood function to get optimal theta.
+            res = minimize(
+                self.nllf, theta_i, args=(data, times), method=method, bounds=bounds
+            )
+            theta = res.x
+
+        return theta
 
-    shape, loc, scale = fit_gev(data, generate_estimates=True)
-    probability = gev.sf(event, shape, loc=loc, scale=scale)
-    return_period = 1.0 / probability
+
+fit_gev = ns_genextreme_gen().fit
+
+
+def return_period(data, event, **kwargs):
+    """Get return period for given event by fitting a GEV."""
+
+    shape, loc, scale = fit_gev(data, **kwargs)
+    return_period = genextreme.isf(
+        event, shape, loc=loc, scale=scale
+    )  # 1.0 / probability
 
     return return_period
 
@@ -206,25 +325,27 @@ def gev_return_curve(
 
     curve_return_periods = np.logspace(0, max_return_period, num=10000)
     curve_probabilities = 1.0 / curve_return_periods
-    curve_values = gev.isf(curve_probabilities, shape, loc, scale)
+    curve_values = genextreme.isf(curve_probabilities, shape, loc, scale)
 
-    event_probability = gev.sf(event_value, shape, loc=loc, scale=scale)
+    event_probability = genextreme.sf(event_value, shape, loc=loc, scale=scale)
     event_return_period = 1.0 / event_probability
 
     # Bootstrapping for confidence interval
     boot_values = curve_values
     boot_event_return_periods = []
     for i in range(n_bootstraps):
         if bootstrap_method == "parametric":
-            boot_data = gev.rvs(shape, loc=loc, scale=scale, size=len(data))
+            boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data))
         elif bootstrap_method == "non-parametric":
             boot_data = np.random.choice(data, size=data.shape, replace=True)
         boot_shape, boot_loc, boot_scale = fit_gev(boot_data, generate_estimates=True)
 
-        boot_value = gev.isf(curve_probabilities, boot_shape, boot_loc, boot_scale)
+        boot_value = genextreme.isf(
+            curve_probabilities, boot_shape, boot_loc, boot_scale
+        )
         boot_values = np.vstack((boot_values, boot_value))
 
-        boot_event_probability = gev.sf(
+        boot_event_probability = genextreme.sf(
             event_value, boot_shape, loc=boot_loc, scale=boot_scale
         )
         boot_event_return_period = 1.0 / boot_event_probability