From 8126b8aed8fb293e3f0e8cd622019841fc5b3e42 Mon Sep 17 00:00:00 2001
From: EricThomson <thomson.eric@gmail.com>
Date: Wed, 21 Feb 2024 18:21:47 -0500
Subject: [PATCH 1/2] handle problems that can come up with auto-percentile
 calculations for dff

---
 caiman/utils/stats.py | 47 ++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/caiman/utils/stats.py b/caiman/utils/stats.py
index 4264ab7c2..94d75c46b 100644
--- a/caiman/utils/stats.py
+++ b/caiman/utils/stats.py
@@ -169,6 +169,8 @@ def df_percentile(inputData, axis=None):
     Extracting the percentile of the data where the mode occurs and its value.
     Used to determine the filtering level for DF/F extraction. Note that
     computation can be inaccurate for short traces.
+
+    If errors occur, will just return fallback value of median (50% percentile)
     """
     if axis is not None:
 
@@ -181,24 +183,37 @@ def fnc(x):
     else:
         # Create the function that we can use for the half-sample mode
         err = True
-        while err:
+        err_count = 0
+        max_err_count = 10
+        while err==True and err_count < max_err_count:
             try:
                 bandwidth, mesh, density, cdf = kde(inputData)
                 err = False
-            except:
-                logging.warning('Percentile computation failed. Duplicating ' + 'and trying again.')
+            except ValueError:
+                err_count += 1
+                logging.warning(f"kde() failure {err_count}. Concatenating traces and recomputing.")
                 if not isinstance(inputData, list):
                     inputData = inputData.tolist()
                 inputData += inputData
 
-        data_prct = cdf[np.argmax(density)] * 100
-        val = mesh[np.argmax(density)]
+        # There are three ways the above calculation can go wrong. 
+        # For each case: just use median.
+        # if cdf was never defined, it means kde never ran without error
+        try:
+            cdf
+        except NameError:
+            logging.warning('Max err count reached. Reverting to median.')
+            data_prct = 50
+            val = np.median(np.array(inputData))
+        else:
+            data_prct = cdf[np.argmax(density)] * 100
+            val = mesh[np.argmax(density)]
+
         if data_prct >= 100 or data_prct < 0:
-            logging.warning('Invalid percentile computed possibly due ' + 'short trace. Duplicating and recomuputing.')
-            if not isinstance(inputData, list):
-                inputData = inputData.tolist()
-            inputData *= 2
-            err = True
+            logging.warning('Invalid percentile (<0 or >100) computed. Reverting to median.')
+            data_prct = 50
+            val = np.median(np.array(inputData))
+
         if np.isnan(data_prct):
             logging.warning('NaN percentile computed. Reverting to median.')
             data_prct = 50
@@ -215,8 +230,6 @@ def fnc(x):
 Daniel B. Smith, PhD
 Updated 1-23-2013
 """
-
-
 def kde(data, N=None, MIN=None, MAX=None):
 
     # Parameters to set up the mesh on which to calculate
@@ -241,12 +254,13 @@ def kde(data, N=None, MIN=None, MAX=None):
     SqDCTData = (DCTData[1:] / 2)**2
 
     # The fixed point calculation finds the bandwidth = t_star
+    # fixed_point is function defined below
     guess = 0.1
     try:
-        t_star = scipy.optimize.brentq(fixed_point, 0, guess, args=(M, I, SqDCTData))
-    except ValueError:
-        print('Oops!')
-        return None
+       t_star = scipy.optimize.brentq(fixed_point, 0, guess, args=(M, I, SqDCTData))
+    except ValueError as e:
+       # 
+       raise ValueError(f"Unable to find root: {str(e)}")
 
     # Smooth the DCTransformed data using t_star
     SmDCTData = DCTData * np.exp(-np.arange(N)**2 * np.pi**2 * t_star / 2)
@@ -262,6 +276,7 @@ def kde(data, N=None, MIN=None, MAX=None):
 
 
 def fixed_point(t, M, I, a2):
+    # TODO: document this
     l = 7
     I = np.float64(I)
     M = np.float64(M)

From dc8237cffbce2d7c9ac47c242fef4b1c5c83f90f Mon Sep 17 00:00:00 2001
From: EricThomson <thomson.eric@gmail.com>
Date: Wed, 21 Feb 2024 20:37:52 -0500
Subject: [PATCH 2/2] updated based on review

---
 caiman/utils/stats.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/caiman/utils/stats.py b/caiman/utils/stats.py
index 94d75c46b..a5688fe36 100644
--- a/caiman/utils/stats.py
+++ b/caiman/utils/stats.py
@@ -170,7 +170,7 @@ def df_percentile(inputData, axis=None):
     Used to determine the filtering level for DF/F extraction. Note that
     computation can be inaccurate for short traces.
 
-    If errors occur, will just return fallback value of median (50% percentile)
+    If errors occur, return fallback values
     """
     if axis is not None:
 
@@ -185,7 +185,7 @@ def fnc(x):
         err = True
         err_count = 0
         max_err_count = 10
-        while err==True and err_count < max_err_count:
+        while err and err_count < max_err_count:
             try:
                 bandwidth, mesh, density, cdf = kde(inputData)
                 err = False
@@ -226,7 +226,8 @@ def fnc(x):
 An implementation of the kde bandwidth selection method outlined in:
 Z. I. Botev, J. F. Grotowski, and D. P. Kroese. Kernel density
 estimation via diffusion. The Annals of Statistics, 38(5):2916-2957, 2010.
-Based on the implementation in Matlab by Zdravko Botev.
+Based on the implementation in Matlab by Zdravko Botev:
+See: https://www.mathworks.com/matlabcentral/fileexchange/14034-kernel-density-estimator
 Daniel B. Smith, PhD
 Updated 1-23-2013
 """
@@ -253,8 +254,7 @@ def kde(data, N=None, MIN=None, MAX=None):
     I = [iN * iN for iN in range(1, N)]
     SqDCTData = (DCTData[1:] / 2)**2
 
-    # The fixed point calculation finds the bandwidth = t_star
-    # fixed_point is function defined below
+    # The fixed point calculation finds the bandwidth = t_star for smoothing (see paper cited above)
     guess = 0.1
     try:
        t_star = scipy.optimize.brentq(fixed_point, 0, guess, args=(M, I, SqDCTData))
@@ -276,7 +276,8 @@ def kde(data, N=None, MIN=None, MAX=None):
 
 
 def fixed_point(t, M, I, a2):
-    # TODO: document this
+    # TODO: document this: 
+    #   From Matlab code: this implements the function t-zeta*gamma^[l](t)
     l = 7
     I = np.float64(I)
     M = np.float64(M)