Fix obvious issues

tansey-lab · Feb 8, 2024 · b1ef787 · b1ef787
1 parent fda1994
commit b1ef787
Show file tree

Hide file tree

Showing 7 changed files with 25 additions and 50 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,8 @@ dependencies = [
     "torchvision",
     "spatialdata-io",
     "matplotlib",
-    "autograd-minimize"
+    "autograd-minimize",
+    "scikit-learn"
 ]
 
 [project.optional-dependencies]

diff --git a/src/nuc2seg/kernel_cell_segmentation.py b/src/nuc2seg/kernel_cell_segmentation.py
@@ -2,6 +2,12 @@
 import numpy as np
 from scipy.special import softmax
 from scipy.stats import poisson
+import torch
+from torch.distributions import Poisson
+from autograd_minimize import minimize
+from tqdm import tqdm
+from nuc2seg.utils import grid_graph_edges, calc_plateaus
+from sklearn.cluster import KMeans
 
 
 def calculate_neighbor_weights(
@@ -167,10 +173,6 @@ def e_step(
 def fit_fused_lasso_count_process(
     counts, lam_min=1e-2, lam_max=50, n_lam=30, segmentation_threshold=0.1
 ):
-    import torch
-    from torch.distributions import Poisson
-    from autograd_minimize import minimize
-    from tqdm import tqdm
 
     # Estimates of the latent rates
     Rates = np.zeros((n_lam,) + counts.shape)
@@ -334,7 +336,6 @@ def test_fused_lasso_count_process():
     )
 
     # Initialize cell type profiles and cell type IDs via K-means
-    from sklearn.cluster import KMeans
 
     kmeans = KMeans(n_clusters=n_cell_types).fit(cell_transcript_counts)
     model_cell_types = kmeans.labels_ + 1

diff --git a/src/nuc2seg/segment.py b/src/nuc2seg/segment.py
@@ -1,11 +1,13 @@
 import torch
 import numpy as np
+import geopandas
+import pandas
 
 from scipy.sparse import csr_matrix
 from scipy.sparse.csgraph import connected_components
 from scipy.special import expit, softmax
 
-from xenium_utils import pol2cart
+from xenium_utils import pol2cart, create_pixel_geodf, load_nuclei
 
 
 def temp_forward(model, x, y, z):
@@ -262,7 +264,7 @@ def flow_graph_segmentation(
         nuclei_geo_df = load_nuclei(nuclei_file)
 
         # Find the nearest nucleus to each pixel
-        labels_geo_df = gpd.sjoin_nearest(
+        labels_geo_df = geopandas.sjoin_nearest(
             labels_geo_df, nuclei_geo_df, how="left", distance_col="nucleus_distance"
         )
         labels_geo_df.rename(columns={"index_right": "nucleus_id_xenium"}, inplace=True)
@@ -314,7 +316,7 @@ def greedy_cell_segmentation(
         nuclei_geo_df = load_nuclei(nuclei_file)
 
         # Find the nearest nucleus to each pixel
-        labels_geo_df = gpd.sjoin_nearest(
+        labels_geo_df = geopandas.sjoin_nearest(
             labels_geo_df, nuclei_geo_df, how="left", distance_col="nucleus_distance"
         )
         labels_geo_df.rename(columns={"index_right": "nucleus_id_xenium"}, inplace=True)
@@ -587,4 +589,4 @@ def save_cell_matrix(pixel_labels_arr, tx_geo_df, outfile):
         df[gene_name] = counts[:, idx]
 
     # Save to file
-    pd.DataFrame(df).to_csv(outfile, index=False)
+    pandas.DataFrame(df).to_csv(outfile, index=False)
diff --git a/src/nuc2seg/train.py b/src/nuc2seg/train.py
@@ -1,10 +1,11 @@
 import torch
 import torch.nn as nn
-from nuc2seg.utils.data_loading import XeniumDataset, xenium_collate_fn
+from nuc2seg.data_loading import XeniumDataset, xenium_collate_fn
 from torch import optim
 from torch.utils.data import DataLoader, random_split
 
 from nuc2seg.unet_model import SparseUNet
+from nuc2seg.evaluate import evaluate
 
 
 def angle_loss(predictions, targets):

diff --git a/src/nuc2seg/unet_model.py b/src/nuc2seg/unet_model.py
@@ -1,7 +1,7 @@
 """ Full assembly of the parts to form the complete network """
 
 from torch.nn import Embedding
-
+import numpy as np
 from nuc2seg.unet_parts import *
 
 

diff --git a/src/nuc2seg/utils.py b/src/nuc2seg/utils.py
@@ -122,8 +122,7 @@ def calc_plateaus(beta, edges, rel_tol=1e-4, verbose=0):
             for local_idx in local_check:
                 if (
                     not check_map[local_idx]
-                    and beta[local_idx] >= min_member
-                    and beta[local_idx] <= max_member
+                    and min_member <= beta[local_idx] <= max_member
                 ):
                     # Label this index as being checked so it's not re-checked unnecessarily
                     check_map[local_idx] = True

diff --git a/src/nuc2seg/xenium_utils.py b/src/nuc2seg/xenium_utils.py
@@ -3,11 +3,14 @@
 import os
 import sys
 
+from scipy.special import softmax
 import geopandas as gpd
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt
 from shapely import Polygon
+from scipy.special import logsumexp
+from scipy.stats import poisson
 
 logger = logging.getLogger(__name__)
 
@@ -20,36 +23,6 @@ def configure_logging():
     )
 
 
-def get_args():
-    parser = argparse.ArgumentParser(
-        description="Align transcripts and xenium cell segmentation"
-    )
-    parser.add_argument(
-        "--transcripts",
-        required=True,
-        help="Path to the transcripts.csv file produced by Xenium.",
-    )
-    parser.add_argument(
-        "--boundaries",
-        required=True,
-        help="Path to the cell_boundaries.csv file produced by Xenium.",
-    )
-    parser.add_argument(
-        "--sample-area",
-        default=None,
-        type=str,
-        help='Rectangular area to sample in "x1,x2,y1,y2 format.',
-    )
-    parser.add_argument("--output", required=True, help="Path to the output file.")
-    parser.add_argument(
-        "--boundaries-output", required=True, help="Path to the boundary shapefile."
-    )
-    parser.add_argument(
-        "--transcripts-output", required=True, help="Path to the transcripts shapefile."
-    )
-    return parser.parse_args()
-
-
 def create_shapely_rectangle(x1, y1, x2, y2):
     return Polygon([(x1, y1), (x1, y2), (x2, y2), (x2, y1)])
 
@@ -135,7 +108,6 @@ def estimate_cell_types(
     tol=1e-4,
     warm_start=False,
 ):
-    from scipy.special import softmax
 
     n_nuclei, n_genes = gene_counts.shape
 
@@ -254,7 +226,6 @@ def estimate_cell_types(
 
 
 def aic_bic(gene_counts, expression_profiles, prior_probs):
-    from scipy.special import logsumexp
 
     n_components = expression_profiles.shape[0]
     n_genes = expression_profiles.shape[1]
@@ -327,7 +298,7 @@ def calculate_pixel_loglikes(
     # Calculate the log-likelihoods for each pixel generating a certain set of transcripts.
     # NOTE: yes we could vectorize this, but that leads to memory issues.
     expression_loglikes = np.zeros_like(rate_loglikes)
-    log_expression = np.log(all_expression_profiles)
+    log_expression = np.log(expression_profiles)
     for i in range(expression_loglikes.shape[0]):
         if i % 100 == 0:
             print(i)
@@ -390,7 +361,7 @@ def load_and_filter_transcripts(transcripts_file: str, min_qv=20.0):
     mapping = dict(zip(gene_ids, np.arange(len(gene_ids))))
     tx_geo_df["gene_id"] = tx_geo_df["feature_name"].apply(lambda x: mapping.get(x, 0))
 
-    return tx_geo_df
+    return tx_geo_df, gene_ids
 
 
 def create_pixel_geodf(x_max, y_max):
@@ -414,7 +385,7 @@ def create_pixel_geodf(x_max, y_max):
 def spatial_as_sparse_arrays(
     nuclei_file: str,
     transcripts_file: str,
-    out_dir: str,
+    outdir: str,
     pixel_stride=1,
     min_qv=20.0,
     foreground_nucleus_distance=1,
@@ -431,7 +402,7 @@ def spatial_as_sparse_arrays(
     nuclei_geo_df = load_nuclei(nuclei_file)
 
     # Load the transcript locations
-    tx_geo_df = load_and_filter_transcripts(transcripts_file, min_qv=min_qv)
+    tx_geo_df, gene_ids = load_and_filter_transcripts(transcripts_file, min_qv=min_qv)
     n_genes = tx_geo_df["gene_id"].max() + 1
 
     # Get the approx bounds of the image