Merge pull request #408 from elfofmaxwell/preprocessing_docstring

Preprocessing docstring
aristoteleo · Sep 3, 2022 · 1d1f5c5 · 1d1f5c5
2 parents f5ec039 + 6605b7a
commit 1d1f5c5
Show file tree

Hide file tree

Showing 44 changed files with 1,900 additions and 1,373 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 -   repo: https://github.com/python/black
-    rev: 20.8b1
+    rev: 22.6.0
     hooks:
     - id: black
       args: [--line-length=120]

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -33,4 +33,5 @@ GitPython
 KDEpy
 
 sphinxcontrib-bibtex>=2.3
-sphinx-gallery
+sphinx-gallery
+typing-extensions
diff --git a/dynamo/estimation/csc/utils_velocity.py b/dynamo/estimation/csc/utils_velocity.py
@@ -663,7 +663,7 @@ def solve_alpha_degradation(t, u, beta, intercept=False):
     ym = np.mean(y)
 
     # calculate slope
-    var_x = np.mean(x ** 2) - xm ** 2
+    var_x = np.mean(x**2) - xm**2
     cov = np.sum(y.dot(x)) / n - ym * xm
     k = cov / var_x
 
@@ -776,7 +776,7 @@ def concat_time_series_matrices(mats, t=None):
 # ---------------------------------------------------------------------------------------------------
 # negbin method related
 def compute_dispersion(mX, varX):
-    phi = fit_linreg(mX ** 2, varX - mX, intercept=False)[0]
+    phi = fit_linreg(mX**2, varX - mX, intercept=False)[0]
     return phi
 
 

diff --git a/dynamo/estimation/fit_jacobian.py b/dynamo/estimation/fit_jacobian.py
@@ -5,23 +5,23 @@
 
 
 def hill_inh_func(x, A, K, n, g):
-    Kd = K ** n
-    return A * Kd / (Kd + x ** n) - g * x
+    Kd = K**n
+    return A * Kd / (Kd + x**n) - g * x
 
 
 def hill_inh_grad(x, A, K, n, g):
-    Kd = K ** n
-    return -A * n * Kd * x ** (n - 1) / (Kd + x ** n) ** 2 - g
+    Kd = K**n
+    return -A * n * Kd * x ** (n - 1) / (Kd + x**n) ** 2 - g
 
 
 def hill_act_func(x, A, K, n, g):
-    Kd = K ** n
-    return A * x ** n / (Kd + x ** n) - g * x
+    Kd = K**n
+    return A * x**n / (Kd + x**n) - g * x
 
 
 def hill_act_grad(x, A, K, n, g):
-    Kd = K ** n
-    return A * n * Kd * x ** (n - 1) / (Kd + x ** n) ** 2 - g
+    Kd = K**n
+    return A * n * Kd * x ** (n - 1) / (Kd + x**n) ** 2 - g
 
 
 def calc_mean_squared_deviation(func, x_data, y_mean, y_sigm, weighted=True):

diff --git a/dynamo/estimation/tsc/utils_kinetic.py b/dynamo/estimation/tsc/utils_kinetic.py
@@ -252,11 +252,11 @@ def get_n_labeled(self):
 
     def get_var_nu(self):
         c = self.get_nu()
-        return self.x[:, self.uu] + c - c ** 2
+        return self.x[:, self.uu] + c - c**2
 
     def get_var_nx(self):
         c = self.get_nx()
-        return self.x[:, self.xx] + c - c ** 2
+        return self.x[:, self.xx] + c - c**2
 
     def get_cov_ux(self):
         cu = self.get_nu()
@@ -380,7 +380,7 @@ def get_nu(self):
 
     def get_var_nu(self):
         c = self.get_nu()
-        return self.x[:, self.uu] + c - c ** 2
+        return self.x[:, self.uu] + c - c**2
 
     def computeKnp(self):
         # parameters
@@ -480,11 +480,11 @@ def get_mean_s(self):
 
     def get_var_u(self):
         c = self.get_mean_u()
-        return self.x[:, self.uu] - c ** 2
+        return self.x[:, self.uu] - c**2
 
     def get_var_s(self):
         c = self.get_mean_s()
-        return self.x[:, self.ss] - c ** 2
+        return self.x[:, self.ss] - c**2
 
     def get_cov_us(self):
         cu = self.get_mean_u()
@@ -576,7 +576,7 @@ def get_mean_u(self):
 
     def get_var_u(self):
         c = self.get_mean_u()
-        return self.x[:, self.uu] - c ** 2
+        return self.x[:, self.uu] - c**2
 
     def computeKnp(self):
         # parameters

diff --git a/dynamo/estimation/tsc/utils_moments.py b/dynamo/estimation/tsc/utils_moments.py
@@ -124,11 +124,11 @@ def get_n_labeled(self):
 
     def get_var_nu(self):
         c = self.get_nu()
-        return self.x[:, self.uu] + c - c ** 2
+        return self.x[:, self.uu] + c - c**2
 
     def get_var_nx(self):
         c = self.get_nx()
-        return self.x[:, self.xx] + c - c ** 2
+        return self.x[:, self.xx] + c - c**2
 
     def get_cov_ux(self):
         cu = self.get_nu()

diff --git a/dynamo/external/pearson_residual_recipe.py b/dynamo/external/pearson_residual_recipe.py
@@ -133,7 +133,7 @@ def _highly_variable_pearson_residuals(
             stop = start + chunksize
             mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
             X_dense = X_batch[:, start:stop].toarray()
-            residuals = (X_dense - mu) / np.sqrt(mu + mu ** 2 / theta)
+            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
             residuals = np.clip(residuals, a_min=-clip, a_max=clip)
             residual_gene_var[start:stop] = np.var(residuals, axis=0)
 
@@ -377,7 +377,7 @@ def compute_pearson_residuals(X, theta, clip, check_values, copy=False):
 
     mu = np.array(sums_cells @ sums_genes / sum_total)
     diff = np.array(X - mu)
-    residuals = diff / np.sqrt(mu + mu ** 2 / theta)
+    residuals = diff / np.sqrt(mu + mu**2 / theta)
 
     # clip
     residuals = np.clip(residuals, a_min=-clip, a_max=clip)

diff --git a/dynamo/external/sctransform.py b/dynamo/external/sctransform.py
@@ -216,7 +216,7 @@ def sctransform_core(
     x = model_pars["theta"].values.copy()
     x[x < min_theta] = min_theta
     model_pars["theta"] = x
-    dispersion_par = np.log10(1 + 10 ** genes_log_gmean_step1 / model_pars["theta"].values.flatten())
+    dispersion_par = np.log10(1 + 10**genes_log_gmean_step1 / model_pars["theta"].values.flatten())
 
     model_pars_theta = model_pars["theta"]
     model_pars = model_pars.iloc[:, model_pars.columns != "theta"].copy()
@@ -250,7 +250,7 @@ def sctransform_core(
         )
         full_model_pars[i] = kr.fit(data_predict=x_points)[0]
 
-    theta = 10 ** genes_log_gmean / (10 ** full_model_pars["dispersion"].values - 1)
+    theta = 10**genes_log_gmean / (10 ** full_model_pars["dispersion"].values - 1)
     full_model_pars["theta"] = theta
     del full_model_pars["dispersion"]
 
@@ -261,9 +261,9 @@ def sctransform_core(
     d = X.data
     x, y = X.nonzero()
     mud = np.exp(full_model_pars.values[:, 0][y] + full_model_pars.values[:, 1][y] * cell_attrs["log_umi"].values[x])
-    vard = mud + mud ** 2 / full_model_pars["theta"].values.flatten()[y]
+    vard = mud + mud**2 / full_model_pars["theta"].values.flatten()[y]
 
-    X.data[:] = (d - mud) / vard ** 0.5
+    X.data[:] = (d - mud) / vard**0.5
     X.data[X.data < 0] = 0
     X.eliminate_zeros()
 

diff --git a/dynamo/plot/ezplots.py b/dynamo/plot/ezplots.py
@@ -227,7 +227,7 @@ def zstreamline(
         "zorder": 3,
     }
 
-    mass = np.sqrt((V_grid ** 2).sum(0))
+    mass = np.sqrt((V_grid**2).sum(0))
     # velocity filtering
     if min_vel_mag is not None:
         min_vel_mag = np.clip(min_vel_mag, None, np.quantile(mass, 0.4))

diff --git a/dynamo/plot/heatmaps.py b/dynamo/plot/heatmaps.py
@@ -65,7 +65,7 @@ def rep2(x, length_out):
 
 
 def dnorm(x, u=0, sig=1):
-    return np.exp(-((x - u) ** 2) / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
+    return np.exp(-((x - u) ** 2) / (2 * sig**2)) / (math.sqrt(2 * math.pi) * sig)
 
 
 def kde2d(x, y, h=None, n=25, lims=None):
@@ -286,7 +286,7 @@ def response(
 
     id = 0
     for gene_pairs_ind, gene_pairs in enumerate(pairs_mat):
-        f_ini_ind = (grid_num ** 2) * id
+        f_ini_ind = (grid_num**2) * id
         r_ini_ind = grid_num * id
 
         gene_pair_name = gene_pairs[0] + "->" + gene_pairs[1]
@@ -842,7 +842,7 @@ def causality(
     id = 0
     for gene_pairs_ind in range(0, len(pairs_mat)):
         gene_pairs = pairs_mat[gene_pairs_ind, :]
-        f_ini_ind = (grid_num ** 2) * id
+        f_ini_ind = (grid_num**2) * id
 
         gene_pair_name = reduce(lambda a, b: a + "->" + b, gene_pairs)
 

diff --git a/dynamo/plot/scVectorField.py b/dynamo/plot/scVectorField.py
@@ -470,7 +470,7 @@ def line_integral_conv(
             V_grid = V_grid_[1, :, :].T
 
     if V_threshold is not None:
-        mass = np.sqrt((V_grid ** 2).sum(0))
+        mass = np.sqrt((V_grid**2).sum(0))
         if V_threshold is not None:
             V_grid[0][mass.reshape(V_grid[0].shape) < V_threshold] = np.nan
 
@@ -498,7 +498,7 @@ def line_integral_conv(
         data["velocity_y"] = (velocity_y, "km/s")
         data["velocity_z"] = (velocity_z, "km/s")
         data["velocity_sum"] = (
-            np.sqrt(velocity_x ** 2 + velocity_y ** 2),
+            np.sqrt(velocity_x**2 + velocity_y**2),
             "km/s",
         )
 
@@ -1623,7 +1623,7 @@ def streamline_plot(
         "integration_direction": "both",
         "zorder": 3,
     }
-    mass = np.sqrt((V_grid ** 2).sum(0))
+    mass = np.sqrt((V_grid**2).sum(0))
     linewidth *= 2 * mass / mass[~np.isnan(mass)].max()
     streamplot_kwargs.update({"linewidth": linewidth * streamline_kwargs.pop("linewidth", 1)})
 

diff --git a/dynamo/plot/scatters.py b/dynamo/plot/scatters.py
@@ -758,7 +758,7 @@ def _plot_basis_layer(cur_b, cur_l):
                     values = (
                         calc_1nd_moment(values, knn)[0]
                         if smooth in [1, True]
-                        else calc_1nd_moment(values, knn ** smooth)[0]
+                        else calc_1nd_moment(values, knn**smooth)[0]
                     )
 
                 if affine_transform_A is None or affine_transform_b is None:

diff --git a/dynamo/plot/space.py b/dynamo/plot/space.py
@@ -127,7 +127,7 @@ def space(
         # meaning of s in scatters:
         # https://stackoverflow.com/questions/14827650/pyplot-scatter-plot-marker-size/47403507#47403507
         # Note that np.sqrt(adata.shape[0]) / 16000.0 is used in pl.scatters
-        pointsize = pointsize ** 2 * np.sqrt(adata.shape[0]) / 16000.0
+        pointsize = pointsize**2 * np.sqrt(adata.shape[0]) / 16000.0
 
         main_info("estimated point size for plotting each cell in space: %f" % (pointsize))
 

diff --git a/dynamo/plot/topography.py b/dynamo/plot/topography.py
@@ -129,7 +129,7 @@ def plot_flow_field(
             u_vel[i, j], v_vel[i, j] = vecfld(np.array([uu[i, j], vv[i, j]]))
 
     # Compute speed
-    speed = np.sqrt(u_vel ** 2 + v_vel ** 2)
+    speed = np.sqrt(u_vel**2 + v_vel**2)
 
     # Make linewidths proportional to speed,
     # with minimal line width of 0.5 and max of 3

diff --git a/dynamo/prediction/trajectory.py b/dynamo/prediction/trajectory.py
@@ -95,7 +95,7 @@ def interp_X(self, num=100, **interp_kwargs):
         return self.interpolate(self.interp_t(num=num), **interp_kwargs)
 
     def integrate(self, func):
-        """ Calculate the integral of func along the curve. The first and last points are omitted. """
+        """Calculate the integral of func along the curve. The first and last points are omitted."""
         F = np.zeros(func(self.X[0]).shape)
         tvec = self.calc_tangent(normalize=False)
         for i in range(1, self.X.shape[0] - 1):

diff --git a/dynamo/preprocessing/CnmfPreprocessor.py b/dynamo/preprocessing/CnmfPreprocessor.py
@@ -10,7 +10,9 @@
 
 
 class CnmfPreprocessor(Preprocessor):
-    def __init__(self, **kwargs):
+    def __init__(self, **kwargs) -> None:
+        """A specialized preprocessor based on cNMF. All keyword arguments are forwarded to Preprocessor.__init__."""
+
         super().__init__(**kwargs)
         self.selected_K = 7
         self.n_iter = 200
@@ -26,7 +28,16 @@ def __init__(self, **kwargs):
         # TODO: enable parallel computing in the future. Currently cNMF only provides cmd interfaces for factorization.
         self.num_worker = 1
 
-    def preprocess_adata(self, adata: AnnData):
+    def preprocess_adata(self, adata: AnnData) -> AnnData:
+        """Preprocess the AnnData object with cNMF.
+
+        Args:
+            adata: an AnnData object.
+
+        Returns:
+            The preprocessed AnnData object.
+        """
+
         try:
             from cnmf import cNMF
         except Exception as e:
@@ -67,8 +78,12 @@ def preprocess_adata(self, adata: AnnData):
         self.cnmf_obj = cnmf_obj
         return adata
 
-    def k_selection_plot(self):
+    def k_selection_plot(self) -> None:
+        """Plot the K selection curve of cNMF and save to the output folder."""
+
         self.cnmf_obj.k_selection_plot(close_fig=False)
 
-    def cleanup_cnmf(self):
+    def cleanup_cnmf(self) -> None:
+        """Recursively delete self.output_dir, the folder holding data used for cNMF; errors are ignored."""
+
         rmtree(self.output_dir, ignore_errors=True)