From 7b43109dc5dcc5a0471b60be8c1c41c25c50f6f8 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 15 Jan 2024 15:07:24 +0000 Subject: [PATCH 001/204] Removing umap feature compression --- scripts/shapes/shape_embed.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 0c4efec4..7eb1160e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -342,30 +342,15 @@ def shape_embed_process(): # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) - idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} - y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) y_partial = y.copy() indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) y_partial[indices] = -1 y_blind = -1 * np.ones_like(y) - umap_labels = y_blind - classes = np.array([idx_to_class[i] for i in y]) - - n_components = 64 # Number of UMAP components - component_names = [f"umap{i}" for i in range(n_components)] # List of column names - - logger.info("UMAP fitting") - mapper = umap.UMAP(n_components=64, random_state=42).fit( - latent_space.numpy(), y=umap_labels - ) - - logger.info("UMAP transforming") - semi_supervised_latent = mapper.transform(latent_space.numpy()) - - df = pd.DataFrame(semi_supervised_latent, columns=component_names) + + df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels idx_to_class = {0: "alive", 1: "dead"} From b17827a40f6e0af43a7c520bf99374f80955e72a Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Fri, 5 Jan 2024 18:50:29 +0000 Subject: [PATCH 002/204] Indexation augmentation (forgot this wasnt in here) --- bioimage_embed/shapes/transforms.py | 26 ++++++++++++++++++++++++-- scripts/shapes/shape_embed.py | 7 ++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 33535871..504b71cc 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -159,8 +159,13 @@ def __repr__(self): def get_distogram(self, coords, matrix_normalised=False): xii, yii = coords - distance_matrix = euclidean_distances(np.array([xii, yii]).T) - # Fro norm is the same as the L2 norm, but for positive semi-definite matrices + distance_matrix = euclidean_distances(np.array([xii, yii]).T) / ( + np.sqrt(2) * self.size + ) + # TODO size should be shape of matrix and the normalisation should be + # D / (np.linalg.norm(x.shape[-2:])) + + norm = np.linalg.norm(distance_matrix, "fro") if matrix_normalised: return distance_matrix / np.linalg.norm(distance_matrix, "fro") if not matrix_normalised: @@ -365,3 +370,20 @@ def asym_dist_to_sym_dist(self, asymm_dist): sym_dist = np.max(dist_stack, axis=0) return torch.tensor(np.array(sym_dist)) + + +class RotateIndexingClockwise(nn.Module): + def __init__(self, max_rotations=None, p=1.0): + super(RotateIndexingClockwise, self).__init__() + self.max_rotations = max_rotations + self.probability = p + + def forward(self, img): + if np.random.rand() < self.probability: + if self.max_rotations is None: + self.max_rotations = img.shape[0] + num_rotations = np.random.randint(0, self.max_rotations) + img = np.roll( + img.numpy(), shift=[num_rotations, num_rotations], axis=[0, 1] + ) + return torch.from_numpy(img) diff --git a/scripts/shapes/shape_embed.py 
b/scripts/shapes/shape_embed.py index b6834cac..49f9dff9 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -241,7 +241,12 @@ def shape_embed_process(): # %% gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) transform = transforms.Compose( - [transform_mask_to_dist, transforms.ToTensor(), gray2rgb] + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] ) dataset = datasets.ImageFolder(train_data_path, transform=transform) From 5cf77fc9bd88553dd23df8cdb67cc81e774c504d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 8 Jan 2024 09:54:11 +0000 Subject: [PATCH 003/204] Fixed the import issue --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49f9dff9..df8f26fd 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -41,6 +41,7 @@ CropCentroidPipeline, DistogramToCoords, MaskToDistogramPipeline, + RotateIndexingClockwise, ) import matplotlib.pyplot as plt From 9273f8cfd161a36daf9466ad7718c8a502c059ac Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 8 Jan 2024 13:51:23 +0000 Subject: [PATCH 004/204] missing import --- bioimage_embed/shapes/transforms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 504b71cc..1d350a04 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -11,6 +11,7 @@ from sklearn.metrics.pairwise import euclidean_distances from skimage.measure import find_contours import torch +from torch import nn import torch.nn.functional as F from . import contours From 5f46b74ba775e4ab53e053dcdfcf8003025d40f4 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 17 Jan 2024 13:29:14 +0000 Subject: [PATCH 005/204] Fixing tests --- bioimage_embed/tests/test_lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioimage_embed/tests/test_lightning.py b/bioimage_embed/tests/test_lightning.py index e1e5dc4a..a02ed2ca 100644 --- a/bioimage_embed/tests/test_lightning.py +++ b/bioimage_embed/tests/test_lightning.py @@ -109,7 +109,7 @@ def data(input_dim): @pytest.fixture() def dataset(data): - return data.unsqueeze(0) + return data @pytest.fixture() From 0fc9066d1a4c4b3075af79bf7337bf4e976594fc Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:16:38 +0000 Subject: [PATCH 006/204] First attempt at setting up the testing cicd --- .github/workflows/docker.yaml | 60 ++++++++++----------- .github/workflows/test.yaml | 56 ++++++++++---------- Makefile | 98 ----------------------------------- environment.yml | 32 ++++++------ 4 files changed, 69 insertions(+), 177 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index fa528792..d5c5392e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -2,8 +2,8 @@ name: Publish Docker on: push: branches: - - main - - master + - main + - master # pull_request: ~ env: @@ -14,37 +14,29 @@ jobs: build: runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3.3.0 - with: - fetch-depth: 2 - - name: Log in to the Container registry - uses: docker/login-action@v2.1.0 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout + uses: actions/checkout@v3.3.0 + with: + fetch-depth: 2 + - name: Log in to the Container registry + 
if: ${{ !env.ACT }} + uses: docker/login-action@v2.1.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4.3.0 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4.3.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - - name: Build and push Docker image (version tag) - if: steps.check-version.outputs.current-version - uses: docker/build-push-action@v3.3.0 - with: - context: . - push: true - tags: ghcr.io/${{ github.repository }}:${{ steps.check-version.outputs.current-version }} - labels: ${{ steps.meta.outputs.labels }} - - - name: Build and push Docker image (latest tag) - if: steps.check-version.outputs.current-version - uses: docker/build-push-action@v3.3.0 - with: - context: . - push: true - tags: ghcr.io/${{ github.repository }}:latest - labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file + - name: Build and push Docker image (version tag) + if: steps.check-version.outputs.current-version + uses: docker/build-push-action@v3.3.0 + with: + context: . + push: true + tags: ghcr.io/${{ github.repository }}:${{ steps.check-version.outputs.current-version }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f3d9f5f..cc5c6b31 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,36 +1,34 @@ -# https://github.com/marketplace/actions/install-poetry-action -name: test - -on: [pull_request,push] - +name: conda +on: [push] jobs: - test: + constructor: + name: conda build (${{ matrix.python-version }}, ${{ matrix.os }}) + runs-on: ${{ matrix.os }}-latest defaults: run: - shell: bash -l {0} + shell: ${{ matrix.shell }} strategy: - fail-fast: false matrix: + os: [ubuntu] python-version: ["3.9"] - os: [ubuntu-latest] - # os: [ubuntu-18.04, macos-latest, windows-latest] - runs-on: ${{ matrix.os }} + include: + - os: ubuntu + shell: bash -l {0} + - os: windows + shell: cmd /C call {0} + - os: macos + shell: bash -l {0} steps: - - name: Check out repository - uses: actions/checkout@v2 - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - use-mamba: true - environment-file: environment.yml - python-version: ${{ matrix.python-version }} - - name: poetry env - run: poetry env use python - - name: Poetry lock - run: poetry lock - - name: Install library - run: poetry install --no-interaction - # - name: Run tests - # run: | - # source .venv/bin/activate - # pytest tests/ \ No newline at end of file + - uses: actions/checkout@v2 + - uses: conda-incubator/setup-miniconda@v2 + with: + environment-file: environment.yml + miniforge-variant: Mambaforge + miniforge-version: latest + mamba-version: "*" + use-mamba: true + python-version: ${{ matrix.python-version }} + - name: Run tests + run: | + source .venv/bin/activate + make test diff --git a/Makefile b/Makefile index 1f1fb42a..08d73569 100644 --- a/Makefile +++ b/Makefile @@ -9,101 +9,3 @@ download.data: test: pytest - -GOOGLE_APPLICATION_CREDENTIALS=$(shell pwd)/credentials.json -BUCKET_NAME=idr-hipsci -TRAINING_DIR=idr0034-kilpinen-hipsci -PROJECT=prj-ext-dev-bia-binder-113155 - -JOB_PREFIX=vae -JOB_NAME=$(JOB_PREFIX)_$(shell date +%Y%m%d_%H%M%S) -JOB_DIR=gs://${BUCKET_NAME}/${JOB_NAME}/models -DATA_DIR=gs://${BUCKET_NAME}/${TRAINING_DIR} - 
-.EXPORT_ALL_VARIABLES: - GOOGLE_APPLICATION_CREDENTIALS - BUCKET_NAME - TRAINING_DIR - JOB_PREFIX - JOB_NAME - JOB_DIR - - -# MY_VAR := $(shell echo whatever) - -# test: -# @echo MY_VAR IS $(MY_VAR) - -test: - @echo $$GOOGLE_APPLICATION_CREDENTIALS $$BUCKET_NAME $$TRAINING_DIR - -all: get_data_list build - -build: - conda activate torch - python idr_get_data.py - -get_data_list: - ls /nfs/bioimage/drop/idr*/**/*.tiff > file_list.txt - ls -u /nfs/bioimage/drop/idr*/**/*.tiff > file_list.txt - -run.on.cloud: - python idr_get_data_s3.py - -run.on.cloud.snake: - snakemake --use-conda --cores all \ - --verbose --google-lifesciences \ - --default-remote-prefix idr-hipsci \ - --google-lifesciences-region eu-west2 - -run.snake: - snakemake --cores all -F --use-conda --verbose - -get.env.file: - conda env export --from-history -f environment.yml -n torch - -on.gcp: - gcloud ai-platform jobs submit training ${JOB_NAME} \ - --region=europe-west2 \ - --master-image-uri=gcr.io/cloud-ml-public/training/pytorch-gpu.1-9 \ - --scale-tier=CUSTOM \ - --master-machine-type=n1-standard-8 \ - --master-accelerator=type=nvidia-tesla-t4,count=1 \ - --job-dir=${JOB_DIR} \ - --package-path=./trainer \ - --module-name=trainer.train \ - --stream-logs \ - -- \ - --num-epochs=10 \ - --batch-size=100 \ - --learning-rate=0.001 \ - --gpus=1 - - -on.gcp.big: - gcloud ai-platform jobs submit training ${JOB_NAME} \ - --region=europe-west2 \ - --master-image-uri=gcr.io/cloud-ml-public/training/pytorch-gpu.1-9 \ - --config=config.yaml \ - --job-dir=${JOB_DIR} \ - --package-path=./trainer \ - --module-name=trainer.train \ - --stream-logs \ - -- \ - --num-epochs=10 \ - --batch-size=100 \ - --learning-rate=0.001 \ - --gpus=2 \ - --accelerator='ddp'\ - --num_nodes=3 - -tensorboard: - tensorboard --logdir=gs://$(BUCKET_NAME)/${JOB_NAME} -download.data: - kaggle competitions download -c data-science-bowl-2018 - -test: - pytest - -download.idr: - rsync -avR --progress ctr26@noah-login:/nfs/bioimage/drop/idr0093-mueller-perturbation/ data/idr diff --git a/environment.yml b/environment.yml index 32343b75..e1887e27 100644 --- a/environment.yml +++ b/environment.yml @@ -1,19 +1,19 @@ # name: bioimage_embed channels: - - conda-forge - - defaults - - torch - - bioconda +- conda-forge +- defaults +- torch +- bioconda dependencies: - - cudatoolkit-dev=10 - - python=3.9 - - mamba - - poetry - - gcc - - libgcc - - pytorch - - pillow=9.5.0 - - snakemake-minimal - - pip - - pip: - - -e . +- cudatoolkit-dev=10 +- python=3.9 +- mamba +- poetry +- gcc +- libgcc +- pytorch +- pillow=9.5.0 +- snakemake-minimal +- pip +- pip: + - -e . From 8a2d14d77f9a54e58766e3c253ca81dc46ff5bc2 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:18:32 +0000 Subject: [PATCH 007/204] adding windows back in? 
--- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cc5c6b31..1bdc01fd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -9,7 +9,7 @@ jobs: shell: ${{ matrix.shell }} strategy: matrix: - os: [ubuntu] + os: [ubuntu, windows, macos] python-version: ["3.9"] include: - os: ubuntu From 74f2d13cd0f12f38403fa6f1ad30383f64978c3a Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:19:00 +0000 Subject: [PATCH 008/204] commented instead I think this makes more sense --- .github/workflows/test.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1bdc01fd..e0e9d468 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -9,15 +9,16 @@ jobs: shell: ${{ matrix.shell }} strategy: matrix: - os: [ubuntu, windows, macos] + # os: [ubuntu, windows, macos] + os: [ubuntu] python-version: ["3.9"] include: - os: ubuntu shell: bash -l {0} - - os: windows - shell: cmd /C call {0} - - os: macos - shell: bash -l {0} + # - os: windows + # shell: cmd /C call {0} + # - os: macos + # shell: bash -l {0} steps: - uses: actions/checkout@v2 - uses: conda-incubator/setup-miniconda@v2 From c190e04e7160fc8d047617ecabd91675774965bb Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:19:40 +0000 Subject: [PATCH 009/204] removing snakemake from env --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index e1887e27..568d68ce 100644 --- a/environment.yml +++ b/environment.yml @@ -13,7 +13,6 @@ dependencies: - libgcc - pytorch - pillow=9.5.0 -- snakemake-minimal - pip - pip: - -e . 
From 1e657fdac2d5a2e912a32d33dda8232e5d86c005 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:26:20 +0000 Subject: [PATCH 010/204] Forgot to remove sourceing --- .github/workflows/test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e0e9d468..dbf6e63d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -31,5 +31,4 @@ jobs: python-version: ${{ matrix.python-version }} - name: Run tests run: | - source .venv/bin/activate make test From 510aa042dcc2c5ae67bada8d830cccc64e6c8c73 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:35:40 +0000 Subject: [PATCH 011/204] Generalised the logging a bit --- bioimage_embed/lightning/torch.py | 73 +++++++++---------- bioimage_embed/models/pythae/legacy/vq_vae.py | 6 +- scripts/shapes/shape_embed.py | 5 +- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index e9eef522..53d649fe 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -7,6 +7,7 @@ import argparse import timm from pythae.models.base.base_utils import ModelOutput +import torch.nn.functional as F class LitAutoEncoderTorch(pl.LightningModule): @@ -45,8 +46,8 @@ def __init__(self, model, args=SimpleNamespace()): if args: self.args = SimpleNamespace(**{**vars(args), **vars(self.args)}) # if kwargs: - # merged_kwargs = {k: v for d in kwargs.values() for k, v in d.items()} - # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) + # merged_kwargs = {k: v for d in kwargs.values() for k, v in d.items()} + # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) self.save_hyperparameters(vars(self.args)) # self.model.train() @@ -72,31 +73,24 @@ def get_model_output(self, x, batch_idx): return model_output, loss def training_step(self, batch, batch_idx): - # results = self.get_results(batch) self.model.train() x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output( x, batch_idx, ) - # loss = self.model.training_step(x) - # loss = self.loss_function(model_output,optimizer_idx) - - # self.log("train_loss", self.loss) - # self.log("train_loss", loss) - self.logger.experiment.add_scalar("Loss/train", loss, batch_idx) - - self.logger.experiment.add_image( - "input", torchvision.utils.make_grid(x["data"]), batch_idx - ) - - # if self.PYTHAE_FLAG: - self.logger.experiment.add_image( - "output", - torchvision.utils.make_grid(model_output.recon_x), - batch_idx, + self.log_dict( + { + "loss/train": loss, + "mse/train": F.mse_loss(model_output.recon_x, x["data"]), + }, + # on_step=True, + on_epoch=True, + prog_bar=True, + logger=True, ) - + if isinstance(self.logger, pl.loggers.TensorBoardLogger): + self.log_tensorboard(model_output, x) return loss def loss_function(self, model_output, *args, **kwargs): @@ -121,20 +115,13 @@ def validation_step(self, batch, batch_idx): x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output(x, batch_idx) z = self.embedding_from_output(model_output) - # z_indices - self.logger.experiment.add_embedding( - z, - label_img=x["data"], - global_step=self.current_epoch, - tag="z", - ) - - self.logger.experiment.add_scalar("Loss/val", loss, batch_idx) - self.logger.experiment.add_image( - "val", - torchvision.utils.make_grid(model_output["recon_x"]), - batch_idx, + self.log_dict( + { + "loss/val": loss, + "mse/val": F.mse_loss(model_output.recon_x, x["data"]), + } ) + 
return loss # def lr_scheduler_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None): # # Implement your own logic for updating the lr scheduler @@ -181,19 +168,27 @@ def test_step(self, batch, batch_idx): loss = self.loss_function(model_output) # Log test metrics - self.log("test_loss", loss) + self.log_dict( + { + "loss/test": loss, + "mse/test": F.mse_loss(model_output.recon_x, x["data"]), + } + ) + return loss + + def log_wandb(self): + pass + + def log_tensorboard(self, model_output, x): # Optionally you can add more logging, for example, visualizations: self.logger.experiment.add_image( "test_input", torchvision.utils.make_grid(x["data"]), - batch_idx, + self.global_step, ) self.logger.experiment.add_image( "test_output", torchvision.utils.make_grid(model_output.recon_x), - batch_idx, + self.global_step, ) - - # Return whatever data you need, for example, the loss - return loss diff --git a/bioimage_embed/models/pythae/legacy/vq_vae.py b/bioimage_embed/models/pythae/legacy/vq_vae.py index 38a45706..8ddc00c1 100644 --- a/bioimage_embed/models/pythae/legacy/vq_vae.py +++ b/bioimage_embed/models/pythae/legacy/vq_vae.py @@ -132,10 +132,12 @@ def forward(self, x, epoch=None): input=x["data"], ) # This matches how pythae returns the loss + + indices = (encodings == 1).nonzero(as_tuple=True) + recon_loss = F.mse_loss(x_recon, x["data"], reduction="sum") - mse_loss = F.mse_loss(x_recon, x["data"]) + mse_loss = F.mse_loss(x_recon, x["data"], reduction="mean") - indices = (encodings == 1).nonzero(as_tuple=True) variational_loss = loss-mse_loss pythae_loss_dict = { diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index df8f26fd..bf7e5ec5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -295,13 +295,15 @@ def shape_embed_process(): model_dir = f"my_models/{dataset_path}_{model._get_name()}_{lit_model._get_name()}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + wandb = pl_loggers.WandbLogger(project="bioimage-embed", name="shapes") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + wandb.watch(lit_model, log="all") trainer = pl.Trainer( - logger=tb_logger, + logger=[wandb,tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -310,6 +312,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + log_every_n_steps=1, ) # %% try: From 3b37ca94c5eb040a86b83a2fbedfebf0988e91f5 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:44:17 +0000 Subject: [PATCH 012/204] First attempt as arg hashing for checkpoints --- scripts/shapes/shape_embed.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index df8f26fd..af88d130 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -51,9 +51,17 @@ from matplotlib import rc import logging +import pickle +import base64 +import hashlib logger = logging.getLogger(__name__) +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string def scoring_df(X, y): # Split the data into training and test sets @@ -157,7 +165,6 @@ def shape_embed_process(): path = Path(metadata("")) path.mkdir(parents=True, exist_ok=True) - model_dir = 
f"models/{dataset_path}_{args.model}" # %% transform_crop = CropCentroidPipeline(window_size) @@ -292,7 +299,7 @@ def shape_embed_process(): dataloader.setup() model.eval() - model_dir = f"my_models/{dataset_path}_{model._get_name()}_{lit_model._get_name()}" + model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") From 8e1894340000e1354b9d5905aa1e6fc455e1bc07 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:55:36 +0000 Subject: [PATCH 013/204] Early stopping on val loss to stop overfitting --- scripts/shapes/shape_embed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index bf7e5ec5..37792a1b 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping # Deal with the filesystem import torch.multiprocessing @@ -312,6 +313,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 397abe8fac046250ed493de4b9892d0209c73eed Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 12:29:21 +0000 Subject: [PATCH 014/204] Attempt at cli --- bioimage_embed/augmentations.py | 73 +++++++-------- bioimage_embed/cli.py | 12 +++ bioimage_embed/hydra.py | 106 +++++++++++++++++++++ bioimage_embed/tests/test_cli.py | 42 +++++++++ conf/augmentations/default.yaml | 70 -------------- conf/bio_vae/default.yaml | 8 -- conf/checkpoints/default.yaml | 3 - conf/config.yaml | 153 ------------------------------- conf/dataloader/default.yaml | 7 -- conf/dataset/default.yaml | 0 conf/hydra/default.yaml | 14 --- conf/ivy_gap.yaml | 103 --------------------- conf/lightning/default.yaml | 3 - conf/logger/default.yaml | 2 - conf/paths/default.yaml | 18 ---- conf/pythae/default.yaml | 17 ---- conf/timm/default.yaml | 15 --- conf/trainer/default.yaml | 18 ---- scripts/shapes/shape_embed.py | 33 +------ 19 files changed, 197 insertions(+), 500 deletions(-) create mode 100644 bioimage_embed/cli.py create mode 100644 bioimage_embed/hydra.py create mode 100644 bioimage_embed/tests/test_cli.py delete mode 100644 conf/augmentations/default.yaml delete mode 100644 conf/bio_vae/default.yaml delete mode 100644 conf/checkpoints/default.yaml delete mode 100644 conf/config.yaml delete mode 100644 conf/dataloader/default.yaml delete mode 100644 conf/dataset/default.yaml delete mode 100644 conf/hydra/default.yaml delete mode 100644 conf/ivy_gap.yaml delete mode 100644 conf/lightning/default.yaml delete mode 100644 conf/logger/default.yaml delete mode 100644 conf/paths/default.yaml delete mode 100644 conf/pythae/default.yaml delete mode 100644 conf/timm/default.yaml delete mode 100644 conf/trainer/default.yaml diff --git a/bioimage_embed/augmentations.py b/bioimage_embed/augmentations.py index e2c14074..6c9daba4 100644 --- a/bioimage_embed/augmentations.py +++ b/bioimage_embed/augmentations.py @@ -1,40 +1,6 @@ import albumentations as A import cv2 -DEFAULT_AUGMENTATION = A.Compose( - [ - # Flip the images horizontally or vertically with a 50% chance - A.OneOf( - [ - A.HorizontalFlip(p=0.5), - A.VerticalFlip(p=0.5), - ], - p=0.5, - ), - # Rotate the images by a random angle within a specified range - A.Rotate(limit=45, p=0.5), - # Randomly scale the image intensity to adjust 
brightness and contrast - A.RandomGamma(gamma_limit=(80, 120), p=0.5), - # Apply random elastic transformations to the images - A.ElasticTransform( - alpha=1, - sigma=50, - alpha_affine=50, - p=0.5, - ), - # Shift the image channels along the intensity axis - A.ChannelShuffle(p=0.5), - # Add a small amount of noise to the images - A.GaussNoise(var_limit=(10.0, 50.0), p=0.5), - # Crop a random part of the image and resize it back to the original size - A.RandomResizedCrop( - height=512, width=512, scale=(0.9, 1.0), ratio=(0.9, 1.1), p=0.5 - ), - # Adjust image intensity with a specified range for individual channels - A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5), - ] -) - from typing import Any import albumentations @@ -43,6 +9,39 @@ from omegaconf import DictConfig from PIL import Image +DEFAULT_AUGMENTATION_LIST = [ + # Flip the images horizontally or vertically with a 50% chance + A.OneOf( + [ + A.HorizontalFlip(p=0.5), + A.VerticalFlip(p=0.5), + ], + p=0.5, + ), + # Rotate the images by a random angle within a specified range + A.Rotate(limit=45, p=0.5), + # Randomly scale the image intensity to adjust brightness and contrast + A.RandomGamma(gamma_limit=(80, 120), p=0.5), + # Apply random elastic transformations to the images + A.ElasticTransform( + alpha=1, + sigma=50, + alpha_affine=50, + p=0.5, + ), + # Shift the image channels along the intensity axis + A.ChannelShuffle(p=0.5), + # Add a small amount of noise to the images + A.GaussNoise(var_limit=(10.0, 50.0), p=0.5), + # Crop a random part of the image and resize it back to the original size + A.RandomResizedCrop( + height=512, width=512, scale=(0.9, 1.0), ratio=(0.9, 1.1), p=0.5 + ), + # Adjust image intensity with a specified range for individual channels + A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5), +] + +DEFAULT_AUGMENTATION = A.Compose(DEFAULT_AUGMENTATION_LIST) class TransformsWrapper: def __init__(self, transforms_cfg: DictConfig) -> None: @@ -81,9 +80,7 @@ def __init__(self, transforms_cfg: DictConfig) -> None: _convert_="object", ) valid_test_predict_aug.append(aug) - self.valid_test_predict_aug = albumentations.Compose( - valid_test_predict_aug - ) + self.valid_test_predict_aug = albumentations.Compose(valid_test_predict_aug) def set_mode(self, mode: str) -> None: """Set `__call__` mode. 
@@ -111,4 +108,4 @@ def __call__(self, image: Any, **kwargs: Any) -> Any: image = np.asarray(image) if self.mode == "train": return self.train_aug(image=image, **kwargs) - return self.valid_test_predict_aug(image=image, **kwargs) \ No newline at end of file + return self.valid_test_predict_aug(image=image, **kwargs) diff --git a/bioimage_embed/cli.py b/bioimage_embed/cli.py new file mode 100644 index 00000000..45529654 --- /dev/null +++ b/bioimage_embed/cli.py @@ -0,0 +1,12 @@ +from .hydra import train, infer +from typer import Typer + +app = Typer() +app.command()(train) +app.command()(infer) + +def main(): + app() + +if __name__ == "__main__": + main() diff --git a/bioimage_embed/hydra.py b/bioimage_embed/hydra.py new file mode 100644 index 00000000..46ad75de --- /dev/null +++ b/bioimage_embed/hydra.py @@ -0,0 +1,106 @@ +from hydra.core.config_store import ConfigStore +from dataclasses import dataclass +from hydra import compose, initialize +from omegaconf import OmegaConf +from types import SimpleNamespace +import hydra +from hydra.core.config_store import ConfigStore +from omegaconf import OmegaConf +import albumentations +from dataclasses import dataclass, field +from bioimage_embed.augmentations import DEFAULT_AUGMENTATION_LIST +import albumentations as A +import os + +@dataclass +class Receipe: + _target_: str = "types.SimpleNamespace" + opt: str = "adamw" + weight_decay: float = 0.001 + momentum: float = 0.9 + sched: str = "cosine" + epochs: int = 50 + lr: float = 1e-4 + min_lr: float = 1e-6 + t_initial: int = 10 + t_mul: int = 2 + lr_min: float = None + decay_rate: float = 0.1 + warmup_lr: float = 1e-6 + warmup_lr_init: float = 1e-6 + warmup_epochs: int = 5 + cycle_limit: int = None + t_in_epochs: bool = False + noisy: bool = False + noise_std: float = 0.1 + noise_pct: float = 0.67 + noise_seed: int = None + cooldown_epochs: int = 5 + warmup_t: int = 0 + + +@dataclass +class Transform: + _target_: str = "albumentations.Compose" + transforms: A.Compose = field(default_factory=A.Compose(DEFAULT_AUGMENTATION_LIST)) + + +# @dataclass +# class AlbumentationsTransform: +# _target_: str = "albumentations.from_dict" +# transform_dict: dict = field(default_factory=A.from_dict) +# transform = A.from_dict(OmegaConf.to_container(cfg.albumentations, resolve=True)) + + +@dataclass +class ImageDataset: + _target_: str = "torchvision.datasets.ImageFolder" + transform: Transform = field(default_factory=Transform) + + +@dataclass +class Dataset: + pass + + +@dataclass +class DataLoader: + _target_: str = "bioimage_embed.lightning.dataloader.DataModule" + dataset: str = field(default_factory=ImageDataset) + + +# def cs_generator(): +cs = ConfigStore.instance() +cs.store(name="receipe", node=Receipe) +cs.store(name="dataloader", node=DataLoader) + + +# return cs +def train(): + main(job_name="test_app") + + +def write_default_config_file(config_path, config_filename, config): + os.makedirs(config_path, exist_ok=True) + with open(os.path.join(config_path, config_filename), "w") as file: + file.write(OmegaConf.to_yaml(config)) + + +def main(config_path="conf", job_name="test_app"): + config_file = os.path.join(config_path, "config.yaml") + + # Check if the configuration directory exists, if not, create it + if not os.path.exists(config_path): + os.makedirs(config_path) + # Initialize Hydra with a basic configuration + hydra.initialize(version_base=None, config_path=config_path, job_name=job_name) + cfg = hydra.compose(config_name="config") + # Save the default configuration + with open(config_file, 
"w") as file: + file.write(OmegaConf.to_yaml(cfg)) + else: + # Initialize Hydra normally if the configuration directory exists + hydra.initialize(version_base=None, config_path=config_path, job_name=job_name) + cfg = hydra.compose(config_name="config") + + print(OmegaConf.to_yaml(cfg)) diff --git a/bioimage_embed/tests/test_cli.py b/bioimage_embed/tests/test_cli.py new file mode 100644 index 00000000..dca082aa --- /dev/null +++ b/bioimage_embed/tests/test_cli.py @@ -0,0 +1,42 @@ +import os +import pytest +from ..hydra import main + +def test_main_creates_config(): + # Arrange + config_path = "test_conf" + job_name = "test_app" + + # Ensure the configuration directory does not exist initially + if os.path.exists(config_path): + os.rmdir(config_path) + + # Act + main(config_path=config_path, job_name=job_name) + + # Assert + assert os.path.exists(config_path), "Config directory was not created" + assert os.path.isfile(os.path.join(config_path, "config.yaml")), "Config file was not created" + + # Clean up + os.remove(os.path.join(config_path, "config.yaml")) + os.rmdir(config_path) + +@pytest.mark.parametrize("config_path, job_name", [ + ("conf", "test_app"), + ("another_conf", "another_job") +]) +def test_hydra_initializes(config_path, job_name): + # Act + main(config_path=config_path, job_name=job_name) + + # Assert + # Here you can assert specifics about the cfg object if needed. + # Since main does not return anything, you might need to adjust + # the main function to return the cfg for more thorough testing. + + # Clean up + if os.path.exists(config_path): + os.remove(os.path.join(config_path, "config.yaml")) + os.rmdir(config_path) + \ No newline at end of file diff --git a/conf/augmentations/default.yaml b/conf/augmentations/default.yaml deleted file mode 100644 index 3ab17c45..00000000 --- a/conf/augmentations/default.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# __version__: 1.3.0 -# transform: -# __class_fullname__: Compose -# additional_targets: {} -# bbox_params: null -# keypoint_params: null -# p: 1.0 -# transforms: -# - __class_fullname__: OneOf -# p: 0.5 -# transforms: -# - __class_fullname__: HorizontalFlip -# always_apply: false -# p: 0.5 -# - __class_fullname__: VerticalFlip -# always_apply: false -# p: 0.5 -# - __class_fullname__: Rotate -# always_apply: false -# border_mode: 4 -# crop_border: false -# interpolation: 1 -# limit: -# - -45 -# - 45 -# mask_value: null -# p: 0.5 -# rotate_method: largest_box -# value: null -# - __class_fullname__: RandomGamma -# always_apply: false -# eps: null -# gamma_limit: -# - 80 -# - 120 -# p: 0.5 -# - __class_fullname__: ElasticTransform -# alpha: 1 -# alpha_affine: 50 -# always_apply: false -# approximate: false -# border_mode: 4 -# interpolation: 1 -# mask_value: null -# p: 0.5 -# same_dxdy: false -# sigma: 50 -# value: null -# - __class_fullname__: GaussNoise -# always_apply: false -# mean: 0 -# p: 0.5 -# per_channel: true -# var_limit: -# - 10.0 -# - 50.0 -# - __class_fullname__: RandomCrop -# always_apply: false -# height: ${dataset.crop_size[0]} -# p: 1 -# width: ${dataset.crop_size[1]} -# - __class_fullname__: Normalize -# always_apply: true -# p: 1.0 -# transpose_mask: false -# - __class_fullname__: ToTensorV2 -# always_apply: true -# p: 1.0 -# transpose_mask: false - diff --git a/conf/bio_vae/default.yaml b/conf/bio_vae/default.yaml deleted file mode 100644 index 12f762d1..00000000 --- a/conf/bio_vae/default.yaml +++ /dev/null @@ -1,8 +0,0 @@ -_target_: bioimage_embed.models.BioimageEmbed -model: "VQVAE" -input_dim: - - 3 - - 128 - - 
128 -latent_dim: 64 -model_config: ${pythae.model_config} diff --git a/conf/checkpoints/default.yaml b/conf/checkpoints/default.yaml deleted file mode 100644 index 76ebb7cf..00000000 --- a/conf/checkpoints/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint -dirpath: ${paths.output_dir} -save_last: True \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml deleted file mode 100644 index d8156ab3..00000000 --- a/conf/config.yaml +++ /dev/null @@ -1,153 +0,0 @@ -defaults: - - _self_ - - trainer: default.yaml - - pythae: default.yaml - # - optimizer: default.yaml - # - scheulder: default.yaml - - timm: default.yaml - - augmentations: default.yaml - # - dataset: default.yaml - - dataloader: default.yaml - - paths: default.yaml - - lightning: default.yaml - - bioimage_embed: default.yaml - - logger: default.yaml - - checkpoints: default.yaml - -version_base: 2.0 - -# seed for random number generators in pytorch, numpy and python.random -seed: 42 - -# name of the run, accessed by loggers -name: null - -trainer: - accelerator: "gpu" - devices: "auto" - gradient_clip_val: 1 - accumulate_grad_batches: 16 - min_epochs: 0 - max_epochs: 200 - strategy: "ddp" - profiler: null - fast_dev_run: False - -dataset: - name: "ivy_gap" - # dir: "data" - train_dataset_glob: ${paths.data_dir}/${dataset.name}/random/*png - crop_size: - - 256 - - 256 - -dataloader: - batch_size: 32 - num_workers: 8 - pin_memory: false - shuffle: true - persistent_workers: true - -model: - _target_: bioimage_embed.models.create_model - name: "resnet18_vqvae_legacy" - # Dims match ImageNet - input_dim: [3, 64, 64] - latent_dim: 8 - opt: LAMB - lr: 1.0e-4 - weight_decay: 0.0001 - momentum: 0.9 - sched: cosine - min_lr: 1.0e-6 - warmup_epochs: 5 - warmup_lr: 1.0e-6 - cooldown_epochs: 10 - t_max: 50 - cycle_momentum: false - -# pythae: -# encoder: bioimage_embed.models.ResNet18VAEEncoder -# # _target_: Encoder_ResNet_VQVAE_CELEBA -# decoder: bioimage_embed.models.ResNet18VAEDecoder -# model_config: -# _target_: pythae.models.VAEConfig - -albumentations: - __version__: 1.3.0 - transform: - __class_fullname__: Compose - additional_targets: {} - bbox_params: null - keypoint_params: null - p: 1.0 - transforms: - - __class_fullname__: OneOf - p: 0.5 - transforms: - - __class_fullname__: HorizontalFlip - always_apply: false - p: 0.5 - - __class_fullname__: VerticalFlip - always_apply: false - p: 0.5 - - __class_fullname__: RandomCrop - always_apply: true - height: ${dataset.crop_size[0]} - p: 1 - width: ${dataset.crop_size[1]} - # scale: - # - 1.0 - # - 1.0 - # - __class_fullname__: Rotate - # always_apply: false - # border_mode: 4 - # crop_border: false - # interpolation: 1 - # limit: - # - -45 - # - 45 - # mask_value: null - # p: 0.5 - # rotate_method: largest_box - # value: null - # - __class_fullname__: RandomGamma - # always_apply: false - # eps: null - # gamma_limit: - # - 80 - # - 120 - # p: 0.5 - # - __class_fullname__: ElasticTransform - # alpha: 1 - # alpha_affine: 50 - # always_apply: false - # approximate: false - # border_mode: 4 - # interpolation: 1 - # mask_value: null - # p: 0.5 - # same_dxdy: false - # sigma: 50 - # value: null - # - __class_fullname__: GaussNoise - # always_apply: false - # mean: 0 - # p: 0.5 - # per_channel: true - # var_limit: - # - 10.0 - # - 50.0 - - __class_fullname__: Resize - always_apply: true - height: ${model.input_dim[1]} - p: 1 - width: ${model.input_dim[2]} - - __class_fullname__: ToFloat - always_apply: true - 
p: 1.0 - max_value: 1.0 - - __class_fullname__: ToTensorV2 - always_apply: true - p: 1.0 - # transpose_mask: false diff --git a/conf/dataloader/default.yaml b/conf/dataloader/default.yaml deleted file mode 100644 index 872861b6..00000000 --- a/conf/dataloader/default.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: bioimage_embed.lightning.DatamoduleGlob -glob_str: ${dataset.train_dataset_glob} -batch_size: 32 -num_workers: 4 -pin_memory: true -shuffle: true -persistent_workers: true \ No newline at end of file diff --git a/conf/dataset/default.yaml b/conf/dataset/default.yaml deleted file mode 100644 index e69de29b..00000000 diff --git a/conf/hydra/default.yaml b/conf/hydra/default.yaml deleted file mode 100644 index 9de8ac12..00000000 --- a/conf/hydra/default.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# https://hydra.cc/docs/configure_hydra/intro/ -# https://github.com/ashleve/lightning-hydra-template/blob/main/configs/hydra/default.yaml - -# enable color logging -defaults: - - override hydra_logging: colorlog - - override job_logging: colorlog - -# output directory, generated dynamically on each run -run: - dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} -sweep: - dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} - subdir: ${hydra.job.num} \ No newline at end of file diff --git a/conf/ivy_gap.yaml b/conf/ivy_gap.yaml deleted file mode 100644 index 777faccc..00000000 --- a/conf/ivy_gap.yaml +++ /dev/null @@ -1,103 +0,0 @@ -dataset: "ivy_gap" -data_dir: "data" -train_dataset_glob: f"{data_dir}/{dataset}/random/*png" - -optimizer_params: - opt: LAMB - lr: 0.001 - weight_decay: 0.0001 - momentum: 0.9 - -lr_scheduler_params: - sched: cosine - min_lr: 1.0e-6 - warmup_epochs: 5 - warmup_lr: 1.0e-6 - cooldown_epochs: 10 - t_max: 50 - cycle_momentum: false - -albumentations: - __version__: 1.3.0 - transform: - __class_fullname__: Compose - additional_targets: {} - bbox_params: null - keypoint_params: null - p: 1.0 - transforms: - - __class_fullname__: OneOf - p: 0.5 - transforms: - - __class_fullname__: HorizontalFlip - always_apply: false - p: 0.5 - - __class_fullname__: VerticalFlip - always_apply: false - p: 0.5 - - __class_fullname__: Rotate - always_apply: false - border_mode: 4 - crop_border: false - interpolation: 1 - limit: - - -45 - - 45 - mask_value: null - p: 0.5 - rotate_method: largest_box - value: null - - __class_fullname__: RandomGamma - always_apply: false - eps: null - gamma_limit: - - 80 - - 120 - p: 0.5 - - __class_fullname__: ElasticTransform - alpha: 1 - alpha_affine: 50 - always_apply: false - approximate: false - border_mode: 4 - interpolation: 1 - mask_value: null - p: 0.5 - same_dxdy: false - sigma: 50 - value: null - - __class_fullname__: GaussNoise - always_apply: false - mean: 0 - p: 0.5 - per_channel: true - var_limit: - - 10.0 - - 50.0 - - __class_fullname__: RandomCrop - always_apply: false - height: 128 - p: 1 - width: 128 - - __class_fullname__: RandomBrightnessContrast - always_apply: false - brightness_by_max: true - brightness_limit: - - -0.2 - - 0.2 - contrast_limit: - - -0.2 - - 0.2 - p: 0.5 - - __class_fullname__: Normalize - always_apply: false - max_pixel_value: 255.0 - mean: - - 0.485 - - 0.456 - - 0.406 - p: 1.0 - std: - - 0.229 - - 0.224 - - 0.225 diff --git a/conf/lightning/default.yaml b/conf/lightning/default.yaml deleted file mode 100644 index 6a45b2de..00000000 --- a/conf/lightning/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: bioimage_embed.lightning.LitAutoEncoderTorch -model: ${pythae} -args: 
${timm} \ No newline at end of file diff --git a/conf/logger/default.yaml b/conf/logger/default.yaml deleted file mode 100644 index 2ad96e8b..00000000 --- a/conf/logger/default.yaml +++ /dev/null @@ -1,2 +0,0 @@ -_target_: pytorch_lightning.loggers.TensorBoardLogger -save_dir: ${paths.log_dir} diff --git a/conf/paths/default.yaml b/conf/paths/default.yaml deleted file mode 100644 index d8738dc1..00000000 --- a/conf/paths/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# path to root directory -# this requires PROJECT_ROOT environment variable to exist -# you can replace it with "." if you want the root to be the current working directory -# root_dir: ${oc.env:PROJECT_ROOT} -root_dir: . -# path to data directory -data_dir: ${paths.root_dir}/data/ - -# path to logging directory -log_dir: ${paths.root_dir}/logs/ - -# path to output directory, created dynamically by hydra -# path generation pattern is specified in `configs/hydra/default.yaml` -# use it to store all files generated during the run, like ckpts and metrics -output_dir: ${hydra:runtime.output_dir} - -# path to working directory -work_dir: ${hydra:runtime.cwd} \ No newline at end of file diff --git a/conf/pythae/default.yaml b/conf/pythae/default.yaml deleted file mode 100644 index f4c01e7f..00000000 --- a/conf/pythae/default.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# model_name: VQVAE - -# model: -_target_: pythae.models.VAE -# model_config: $(model.model_config) -encoder: - _target_: bioimage_embed.models.ResNet18VAEEncoder - model_config: ${pythae.model_config} -decoder: - _target_: bioimage_embed.models.ResNet18VAEDecoder - model_config: ${pythae.model_config} - -model_config: - _target_: pythae.models.VAEConfig - _convert_: all - input_dim: ${model.input_dim} - latent_dim: ${model.latent_dim} diff --git a/conf/timm/default.yaml b/conf/timm/default.yaml deleted file mode 100644 index 0d61e8c3..00000000 --- a/conf/timm/default.yaml +++ /dev/null @@ -1,15 +0,0 @@ - # _target_: timm.optim.optimizer -opt: LAMB -lr: 1.0e-3 -weight_decay: 0.0001 -momentum: 0.9 -# scheduler: -# _target_: timm.scheduler.scheduler -sched: cosine -min_lr: 1.0e-6 -warmup_epochs: 5 -warmup_lr: 1.0e-6 -cooldown_epochs: 10 -t_max: 50 -cycle_momentum: false -epochs: ${trainer.max_epochs} \ No newline at end of file diff --git a/conf/trainer/default.yaml b/conf/trainer/default.yaml deleted file mode 100644 index 86d4d552..00000000 --- a/conf/trainer/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -_target_: pytorch_lightning.Trainer - -accelerator: "gpu" -devices: "1" -# weights_summary: null -# progress_bar_refresh_rate: 5 -# resume_from_checkpoint: null -# val_check_interval: 1 -check_val_every_n_epoch: 1 -logger: ${logger} -gradient_clip_val: 1 -enable_checkpointing: True -accumulate_grad_batches: 4 -callbacks: - - ${checkpoints} -min_epochs: 50 -max_epochs: 200 -precision: 32 \ No newline at end of file diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 37792a1b..1034256e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -116,11 +116,7 @@ def shape_embed_process(): "latent_dim": interp_size, "num_embeddings": interp_size, "num_hiddens": interp_size, - "num_residual_hiddens": 32, - "num_residual_layers": 150, "pretrained": True, - # "embedding_dim": 32, - # "num_embeddings": 16, "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, @@ -153,7 +149,7 @@ def shape_embed_process(): # dataset = "bbbc010" # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = 
f"scripts/shapes/data/{dataset_path}" + train_data_path = f"data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) @@ -360,7 +356,7 @@ def shape_embed_process(): indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) y_partial[indices] = -1 y_blind = -1 * np.ones_like(y) - + df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels @@ -370,31 +366,6 @@ def shape_embed_process(): df = df.set_index("Class") df_shape_embed = df.copy() - ax = sns.relplot( - data=df, - x="umap0", - y="umap1", - hue="Class", - palette="deep", - alpha=0.5, - edgecolor=None, - s=5, - height=height, - aspect=0.5 * width / height, - ) - - sns.move_legend( - ax, - "upper center", - ) - ax.set(xlabel=None, ylabel=None) - sns.despine(left=True, bottom=True) - plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) - plt.tight_layout() - plt.savefig(metadata(f"umap_no_axes.pdf")) - # plt.show() - plt.close() - # %% X = df_shape_embed.to_numpy() From 4bf27f9930fe86bdb64e3c48701ddfe7742a8fea Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:55:36 +0000 Subject: [PATCH 015/204] Early stopping on val loss to stop overfitting --- scripts/shapes/shape_embed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f70ed089..91c466d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping # Deal with the filesystem import torch.multiprocessing @@ -319,6 +320,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 4ca9dd01f6bb411291abb7317cc409b465d59eed Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Sat, 20 Jan 2024 08:19:41 +0000 Subject: [PATCH 016/204] adding branch prose back --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b34690d..ccf6c625 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ This utility makes it simple to fetch the necessary datasets: ```bash make download.data ``` +If you don't have a Kaggle account you must create one and then follow the next steps: +1. Install the Kaggle API package so you can download the data from the Makefile you have all the information in their [Github repository](https://github.com/Kaggle/kaggle-api). +2. To use the Kaggle API you need also to create an API token. + You can found how to do it in their [documentation](https://github.com/Kaggle/kaggle-api#api-credentials) +4. After that you will need to add your user and key in a file called `kaggle.json` in this location in your home directory `chmod 600 ~/.kaggle/kaggle.json` +5. Don't forget to accept the conditions for the "2018 Data Science Bowl" on the Kaggle website. + Otherwise you would not be able to pull this data from the command line. ### 4. Developer Installation: @@ -88,4 +95,4 @@ bioimage_embed is licensed under the MIT License. Please refer to the [LICENSE]( --- -Happy Embedding! 🧬🔬 \ No newline at end of file +Happy Embedding! 
🧬🔬 From a34ee720594c02af5aec304a7c521a3c5ee3a22e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:36:43 +0000 Subject: [PATCH 017/204] local changes to run --- scripts/shapes/shape_embed.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 986f556c..038707d4 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -150,14 +150,14 @@ def shape_embed_process(): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" - dataset_path = "shape_embed_data/data/bbbc010/BBBC010_v1_foreground_eachworm/" + dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" # dataset_path = "vampire/mefs/data/processed/Control" # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" # dataset_path = "vampire/torchvision/Control" # dataset = "bbbc010" # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = f"data/{dataset_path}" + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) @@ -316,7 +316,6 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, - callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 9135043708fbba92c0637b91ce41664917a6ab28 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:44:54 +0000 Subject: [PATCH 018/204] command line arguments --- scripts/shapes/shape_embed.py | 57 +++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 038707d4..1ced1d5f 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -21,6 +21,7 @@ import torch from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping +import argparse # Deal with the filesystem import torch.multiprocessing @@ -98,7 +99,7 @@ def scoring_df(X, y): return pd.DataFrame(cv_results) -def shape_embed_process(): +def shape_embed_process(clargs): # Setting the font size mpl.rcParams["font.size"] = 10 @@ -111,14 +112,18 @@ def shape_embed_process(): sns.set(style="white", context="notebook", rc={"figure.figsize": (width, height)}) # matplotlib.use("TkAgg") - interp_size = 128 * 2 + interp_size = clargs.latent_space_size * 2 + #interp_size = 128 * 2 max_epochs = 100 - window_size = 128 * 2 + window_size = clargs.latent_space_size * 2 + #window_size = 128 * 2 params = { - "model":"resnet18_vqvae_legacy", + "model":clargs.model, + #"model":"resnet18_vae", "epochs": 75, - "batch_size": 4, + "batch_size": clargs.batch_size, + #"batch_size": 4, "num_workers": 2**4, "input_dim": (3, interp_size, interp_size), "latent_dim": interp_size, @@ -496,5 +501,45 @@ def shape_embed_process(): # tikzplotlib.save(metadata(f"trials_barplot.tikz")) + + +############################################################################### + if __name__ == "__main__": - shape_embed_process() + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-b', '--batch-size', nargs=1, default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', nargs=1, default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + #parser.add_argument('-v', '--verbose', action='count', default=0, + # help="Increase verbosity level by adding more \"v\".") + + shape_embed_process(parser.parse_args()) From 43c6ed0ea98a94d648d02dc623f745e640ef7a8a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:47:20 +0000 Subject: [PATCH 019/204] enable testing + uncomment dataset --- scripts/shapes/shape_embed.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 1ced1d5f..9130892e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -154,14 +154,8 @@ def shape_embed_process(clargs): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" - # dataset_path = "vampire/mefs/data/processed/Control" - # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" - # dataset_path = "vampire/torchvision/Control" - # dataset = "bbbc010" - - # train_data_path = f"scripts/shapes/data/{dataset_path}" + dataset = "bbbc010" train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" @@ -334,7 +328,7 @@ def shape_embed_process(clargs): lit_model.eval() validation = trainer.validate(lit_model, datamodule=dataloader) - # testing = trainer.test(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) example_input = Variable(torch.rand(1, *args.input_dim)) # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") From e4e0aaeac9bedac111433d9a43c6f14f97131630 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 21:58:18 +0000 Subject: [PATCH 020/204] added a slurm python script --- slurm_shape_embed.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 slurm_shape_embed.py diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py new file mode 100644 index 00000000..68e16cb9 --- /dev/null +++ b/slurm_shape_embed.py @@ -0,0 +1,82 @@ +#! 
/usr/bin/env python3 + +import os +import subprocess +import tempfile + +## Assign the arguments to variables +#model_arg=$1 +#sizes_list="${@:2}" +# +## Create SLURM job script +#job_script="slurm_job.sh" +# +#echo "#!/bin/bash" > "$job_script" +#echo "#SBATCH --job-name=ite_shape_embed" >> "$job_script" +#echo "#SBATCH --output=ite_shape_embed.out" >> "$job_script" +#echo "#SBATCH --error=ite_shape_embed.err" >> "$job_script" +#echo "#SBATCH --gres=gpu:2" >> "$job_script" # Adjust the number of CPUs as needed +#echo "#SBATCH --mem=50GB" >> "$job_script" # Adjust the memory requirement as needed +#echo "" >> "$job_script" +# +## Loop through the sizes and append the Python command to the job script +#for size in $sizes_list; do +# echo "python ite_shape_embed.py --model $model_arg --ls_size $size" >> "$job_script" +#done +# +## Submit SLURM job +#sbatch "$job_script" + +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] +batch_sizes = [4, 8, 16] +latent_space_sizes = [64, 128, 256, 512] + +slurm_script="""#!/bin/bash + +JOB_NAME=shape_embed_{model}_{b_size}_{ls_size} +echo "running shape embed with:" +echo " - model {model}" +echo " - batch size {b_size}" +echo " - latent space size {ls_size}" +python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} +""" + +if __name__ == "__main__": + + slurmdir = f'{os.getcwd()}/slurmdir' + os.makedirs(slurmdir, exist_ok=True) + for m, bs, ls in [ (m,bs,ls) for m in models + for bs in batch_sizes + for ls in latent_space_sizes ]: + jobname = f'shape_embed_{m}_{bs}_{ls}' + print(jobname) + fp = open(mode='w+', file=f'{slurmdir}/slurm_script_shape_embed_{m}_{bs}_{ls}.script') + fp.write(slurm_script.format(model=m, b_size=bs, ls_size=ls)) + fp.flush() + print(f'{fp.name}') + print(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) + result = subprocess.run([ 'sbatch' + , '--time', '10:00:00' + , '--mem', '50GB' + , '--job-name', jobname + , '--output', f'{slurmdir}/{jobname}.out' + , '--error', f'{slurmdir}/{jobname}.err' + , '--gres', 'gpu:2' + , fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) From e78afd640ac0164064ee83927294d009b01fce87 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 21:59:37 +0000 Subject: [PATCH 021/204] fix cli type --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 9130892e..c0560da0 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -528,10 +528,10 @@ def auto_pos_int (x): '-m', '--model', choices=models, default=models[0], metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {models[0]}).") parser.add_argument( - '-b', '--batch-size', nargs=1, default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int , help="The BATCH_SIZE for the run, a positive integer (default 4)") parser.add_argument( - '-l', '--latent-space-size', nargs=1, default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + '-l', '--latent-space-size', default=int(128), 
metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") From ee021c64786b82d9284812e43861afdf84474046 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:04:33 +0000 Subject: [PATCH 022/204] add correct name for the jobs --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index c0560da0..96124967 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -298,7 +298,7 @@ def shape_embed_process(clargs): model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") - wandb = pl_loggers.WandbLogger(project="bioimage-embed", name="shapes") + wandb = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) From 30c34ccd740d5c88a2d25c21a0cbb5f2e1fb5106 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:07:59 +0000 Subject: [PATCH 023/204] Log f1 score mean and std in wandb --- scripts/shapes/shape_embed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 96124967..27cd5a38 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -465,6 +465,13 @@ def shape_embed_process(clargs): trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") + # Special metrics for f1 score for wandb + wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandb.log({"Std": wandb.Table(dataframe=std_df)}) + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) ax = sns.catplot( From b77c4fc0d3838a6c3dd225bbdba180d885375807 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:34:40 +0000 Subject: [PATCH 024/204] choose memory allocation base on latent space size --- scripts/shapes/shape_embed.py | 1 + slurm_shape_embed.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 27cd5a38..d9c6eb86 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,6 +22,7 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse +import wandb # Deal with the filesystem import torch.multiprocessing diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index 68e16cb9..3b78ee35 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -55,6 +55,14 @@ python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} """ +def mem_size(ls): + if ls <= 128: + return '50GB' + if ls <= 256: + return '100GB' + if ls <= 512: + return '300GB' + if __name__ == "__main__": slurmdir = f'{os.getcwd()}/slurmdir' @@ -71,9 +79,10 @@ print(f'cat {fp.name}') result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) + print(mem_size(ls)) result = subprocess.run([ 'sbatch' , '--time', '10:00:00' - , '--mem', '50GB' + , '--mem', 
mem_size(ls) , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' From 8f7d9d8e80c405b167a9bc51784eafec94db550b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:42:36 +0000 Subject: [PATCH 025/204] dynamically chose n gpus based on latent space size + fix mem allocation as well --- slurm_shape_embed.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index 3b78ee35..bd47eef3 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -58,11 +58,19 @@ def mem_size(ls): if ls <= 128: return '50GB' - if ls <= 256: + if ls > 128: return '100GB' - if ls <= 512: + if ls > 256: return '300GB' +def n_gpus(ls): + if ls <= 128: + return 'gpus:2' + if ls > 128: + return 'gpus:2' + if ls > 256: + return 'gpus:3' + if __name__ == "__main__": slurmdir = f'{os.getcwd()}/slurmdir' @@ -86,6 +94,6 @@ def mem_size(ls): , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' - , '--gres', 'gpu:2' + , '--gres', n_gpus(ls) , fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) From 41dec50d1ba7afc5c32ff9f0e0e3e9a1d3b709d0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:44:30 +0000 Subject: [PATCH 026/204] fix gpu allocation typo --- slurm_shape_embed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index bd47eef3..c542dd3e 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -65,11 +65,11 @@ def mem_size(ls): def n_gpus(ls): if ls <= 128: - return 'gpus:2' + return 'gpu:2' if ls > 128: - return 'gpus:2' + return 'gpu:2' if ls > 256: - return 'gpus:3' + return 'gpu:3' if __name__ == "__main__": From 775548a671cc85574873f85e75984c6e67214246 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:48:03 +0000 Subject: [PATCH 027/204] comment out all the mean and std login for f1 --- scripts/shapes/shape_embed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index d9c6eb86..f9d21974 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,7 +22,6 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse -import wandb # Deal with the filesystem import torch.multiprocessing @@ -467,11 +466,11 @@ def shape_embed_process(clargs): trial_df.plot(kind="bar") # Special metrics for f1 score for wandb - wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) - mean_df = trial_df.groupby("trial").mean() - std_df = trial_df.groupby("trial").std() - wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) - wandb.log({"Std": wandb.Table(dataframe=std_df)}) + #wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + #wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) + #wandb.log({"Std": wandb.Table(dataframe=std_df)}) melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) From 1cf6646619a840abed247653943c39cd9bb57091 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 21:55:25 +0000 Subject: [PATCH 028/204] added a --clear-checkpoints clarg --- scripts/shapes/shape_embed.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
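A note on the `mem_size` and `n_gpus` helpers introduced and reworked in the patches above: once the checks read `if ls <= 128:` followed by `if ls > 128:`, the final `if ls > 256:` branch can never be reached, because any value above 256 has already returned from the `ls > 128` branch. A minimal sketch of an ordering in which every tier is reachable, assuming the intended tiers are 50GB/100GB/300GB of memory and 2/2/3 GPUs for small/medium/large latent spaces:

    def mem_size(ls):
        # test the largest threshold first so each tier is reachable
        if ls > 256:
            return '300GB'
        if ls > 128:
            return '100GB'
        return '50GB'

    def n_gpus(ls):
        # same idea for the --gres request
        if ls > 256:
            return 'gpu:3'
        return 'gpu:2'
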
a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f9d21974..a70d02fc 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,6 +22,8 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse +import wandb +import shutil # Deal with the filesystem import torch.multiprocessing @@ -295,6 +297,9 @@ def shape_embed_process(clargs): dataloader.setup() model.eval() + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") @@ -540,7 +545,9 @@ def auto_pos_int (x): parser.add_argument( '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") - shape_embed_process(parser.parse_args()) + shape_embed_process(parser.parse_args()) \ No newline at end of file From 704c88fc8dcdf6ba915f0ea59f0fa70e5ee4092e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 21:55:57 +0000 Subject: [PATCH 029/204] use wandblogger to log info (mean, std dev...) --- scripts/shapes/shape_embed.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index a70d02fc..25326431 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -303,15 +303,15 @@ def shape_embed_process(clargs): model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") - wandb = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") + wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) - wandb.watch(lit_model, log="all") + wandblogger.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb,tb_logger], + logger=[wandblogger,tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -469,13 +469,18 @@ def shape_embed_process(clargs): trial_df.to_csv(metadata(f"trial_df.csv")) trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") - - # Special metrics for f1 score for wandb - #wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + #mean_df = trial_df.groupby("trial").mean() #std_df = trial_df.groupby("trial").std() - #wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) - #wandb.log({"Std": wandb.Table(dataframe=std_df)}) + #wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) From 15343f691d85a4f9137b76d79a1071a35891a353 Mon Sep 
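The patch above renames the `WandbLogger` instance from `wandb` to `wandblogger`, which avoids shadowing the imported `wandb` module and lets `wandb.Table` be used for the score tables. A minimal sketch of the resulting pattern, with a tiny hypothetical score table standing in for the real `trial_df`:

    import pandas as pd
    import wandb
    from pytorch_lightning import loggers as pl_loggers

    # hypothetical per-trial scores standing in for the real trial_df
    trial_df = pd.DataFrame({"trial": ["mask_embed", "regionprops"], "test_f1": [0.91, 0.84]})

    wandblogger = pl_loggers.WandbLogger(project="shape-embed", name="example-run")
    # the Lightning wrapper exposes the underlying wandb run as .experiment,
    # so tables are logged through it rather than through the logger itself
    wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)})
    wandblogger.experiment.log({"Mean": wandb.Table(dataframe=trial_df.groupby("trial").mean())})
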
17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 23:13:19 +0000 Subject: [PATCH 030/204] run individual jobs in own folder to work around checkpoints --- slurm_shape_embed.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index c542dd3e..daea5ca5 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -52,7 +52,11 @@ echo " - model {model}" echo " - batch size {b_size}" echo " - latent space size {ls_size}" -python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} +rand_name=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 16) +mkdir -p slurm_rundir/$rand_name +cp -r $(ls | grep -v slurm_rundir) slurm_rundir/$rand_name/. +cd slurm_rundir/$rand_name +python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} --clear-checkpoints """ def mem_size(ls): From 49813811129bf89db88460dd0c24dc9c58e4dada Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:47:13 +0000 Subject: [PATCH 031/204] Updating pythae --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 82fb7df8..23672073 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ scikit-image = "^0.21.0" iteround = "^1.0.4" ipykernel = "^6.25.1" nonechucks = "^0.4.2" -pythae = "^0.1.1" +pythae = { git = "https://github.com/clementchadebec/benchmark_VAE.git", branch = "main" } pytest = "^7.4.0" pandas = "^2.1.0" bokeh = "^3.2.2" From 9da2a22a00b9de833e36e50c13f65de46fd66346 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:48:30 +0000 Subject: [PATCH 032/204] Adding standard scalar to df scoring fun --- scripts/shapes/shape_embed.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f70ed089..59544fed 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -1,6 +1,8 @@ # %% import seaborn as sns import pyefd +from sklearn.decomposition import PCA +from sklearn.discriminant_analysis import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_validate, KFold, train_test_split from sklearn.metrics import make_scorer @@ -9,6 +11,7 @@ import matplotlib as mpl import seaborn as sns from pathlib import Path +from sklearn.pipeline import Pipeline import umap from torch.autograd import Variable from types import SimpleNamespace @@ -77,14 +80,20 @@ def scoring_df(X, y): } # Create a random forest classifier - clf = RandomForestClassifier() + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + # ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", RandomForestClassifier()), + ] + ) # Specify the number of folds k_folds = 10 # Perform k-fold cross-validation cv_results = cross_validate( - estimator=clf, + estimator=pipeline, X=X, y=y, cv=KFold(n_splits=k_folds), From 873ef920fe40d3a7a6782bd4323fa998c65711c7 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:49:31 +0000 Subject: [PATCH 033/204] Refactoring and adding back umap --- scripts/shapes/shape_embed.py | 69 ++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 59544fed..e8fda82f 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -23,6 +23,8 @@ 
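On the StandardScaler change above: because the scaler sits inside the Pipeline handed to cross_validate, it is re-fit on the training portion of every fold, so the held-out fold never leaks into the normalisation statistics. A minimal self-contained sketch of the same pattern (random stand-in features; note the canonical import path for StandardScaler is sklearn.preprocessing):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import KFold, cross_validate
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.random.rand(60, 8)             # hypothetical feature matrix
    y = np.random.randint(0, 2, size=60)  # hypothetical binary labels

    pipeline = Pipeline([
        ("scaler", StandardScaler()),       # fit on each training fold only
        ("clf", RandomForestClassifier()),
    ])

    cv_results = cross_validate(pipeline, X, y, cv=KFold(n_splits=5),
                                scoring="f1_macro", n_jobs=-1)
    print(cv_results["test_score"].mean())
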
import pytorch_lightning as pl import torch from types import SimpleNamespace +from umap import UMAP +import os # Deal with the filesystem import torch.multiprocessing @@ -66,6 +68,41 @@ def hashing_fn(args): hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() return hashed_string + +def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): + umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + mask = np.random.rand(len(df)) < 0.7 + + semi_labels = df["Class"].copy() + semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision + + umap_embedding = umap_reducer.fit_transform(df, y=semi_labels) + + ax = sns.relplot( + data=pd.DataFrame(umap_embedding, columns=["umap0", "umap1"]), + x="umap0", + y="umap1", + hue="Class", + palette="deep", + alpha=0.5, + edgecolor=None, + s=5, + height=height, + aspect=0.5 * width / height, + ) + + sns.move_legend( + ax, + "upper center", + ) + ax.set(xlabel=None, ylabel=None) + sns.despine(left=True, bottom=True) + plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) + plt.tight_layout() + plt.savefig(metadata(f"umap_no_axes.pdf")) + # plt.show() + plt.close() + def scoring_df(X, y): # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split( @@ -370,11 +407,6 @@ def shape_embed_process(): idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) - y_partial = y.copy() - indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) - y_partial[indices] = -1 - y_blind = -1 * np.ones_like(y) - df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels @@ -384,31 +416,8 @@ def shape_embed_process(): df = df.set_index("Class") df_shape_embed = df.copy() - ax = sns.relplot( - data=df, - x="umap0", - y="umap1", - hue="Class", - palette="deep", - alpha=0.5, - edgecolor=None, - s=5, - height=height, - aspect=0.5 * width / height, - ) - - sns.move_legend( - ax, - "upper center", - ) - ax.set(xlabel=None, ylabel=None) - sns.despine(left=True, bottom=True) - plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) - plt.tight_layout() - plt.savefig(metadata(f"umap_no_axes.pdf")) - # plt.show() - plt.close() - + # %% UMAP plot + umap_plot(df, metadata, width, height) # %% X = df_shape_embed.to_numpy() From 752736386338364f5ec780bb9aef30732e6bcf41 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:49:52 +0000 Subject: [PATCH 034/204] Seed "everything" --- scripts/shapes/shape_embed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index e8fda82f..4efd6352 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -62,6 +62,10 @@ logger = logging.getLogger(__name__) +# Seed everything +np.random.seed(42) +pl.seed_everything(42) + def hashing_fn(args): serialized_args = pickle.dumps(vars(args)) hash_object = hashlib.sha256(serialized_args) From afb0368e68cd3b9b68c4b3236f7137249ac50a0e Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:07 +0000 Subject: [PATCH 035/204] Reduce k folds (should be a hparam) --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 4efd6352..f9e0d716 100644 --- a/scripts/shapes/shape_embed.py +++ 
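The umap_plot added above relies on UMAP's semi-supervised mode: points whose label is set to -1 are treated as unlabelled, while the remaining labels guide the embedding. A minimal sketch with hypothetical latent vectors; note that if the feature rows are shuffled before fitting (as a later revision does with df.sample(frac=1)), the label array has to be reordered the same way to stay aligned:

    import numpy as np
    from umap import UMAP

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 16))      # hypothetical latent vectors
    y = rng.integers(0, 2, size=200)    # hypothetical class codes

    semi_labels = y.copy()
    semi_labels[rng.random(200) > 0.8] = -1   # hide ~20% of labels; -1 means "unlabelled"

    embedding = UMAP(n_neighbors=15, min_dist=0.1, n_components=2,
                     random_state=42).fit_transform(X, y=semi_labels)
    print(embedding.shape)  # (200, 2)
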
b/scripts/shapes/shape_embed.py @@ -130,7 +130,7 @@ def scoring_df(X, y): ) # Specify the number of folds - k_folds = 10 + k_folds = 5 # Perform k-fold cross-validation cv_results = cross_validate( From 941dc804b195a4f9ec918b01166149f39bb11536 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:28 +0000 Subject: [PATCH 036/204] Update args to match what we now think is good --- scripts/shapes/shape_embed.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f9e0d716..72239453 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -165,26 +165,18 @@ def shape_embed_process(): window_size = 128 * 2 params = { - "model":"resnet18_vqvae_legacy", - "epochs": 75, + "model": "resnet50_vqvae", + "epochs": 250, "batch_size": 4, "num_workers": 2**4, "input_dim": (3, interp_size, interp_size), - "latent_dim": interp_size, - "num_embeddings": interp_size, - "num_hiddens": interp_size, - "num_residual_hiddens": 32, - "num_residual_layers": 150, + "latent_dim": int(128), "pretrained": True, - # "embedding_dim": 32, - # "num_embeddings": 16, - "commitment_cost": 0.25, - "decay": 0.99, "frobenius_norm": False, } optimizer_params = { - "opt": "LAMB", + "opt": "AdamW", "lr": 0.001, "weight_decay": 0.0001, "momentum": 0.9, From 2ccf8b4ed64564fc398830e680bf693f60f14712 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:53 +0000 Subject: [PATCH 037/204] Dynamic best weights finding --- scripts/shapes/shape_embed.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 72239453..b1b253f8 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -348,11 +348,17 @@ def shape_embed_process(): Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) - checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) wandb.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb,tb_logger], + logger=[wandb, tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -364,12 +370,20 @@ def shape_embed_process(): log_every_n_steps=1, ) # %% - try: - trainer.fit( - lit_model, datamodule=dataloader, ckpt_path=f"{model_dir}/last.ckpt" - ) - except: - trainer.fit(lit_model, datamodule=dataloader) + + # Determine the checkpoint path for resuming + last_checkpoint_path = f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + + trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) lit_model.eval() From d7761b6d9faa8df7d14ecb670ba5def3d2124365 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 13:09:25 +0000 Subject: [PATCH 038/204] Fixed umap --- scripts/shapes/shape_embed.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b1b253f8..b38b4722 100644 --- a/scripts/shapes/shape_embed.py +++ 
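The resume logic in the "Dynamic best weights finding" patch above boils down to: prefer last.ckpt, fall back to the checkpoint callback's best model (best_model_path is an empty string until something has been saved), otherwise start from scratch. A small sketch of the same decision as a helper; the function name is ours, not part of the codebase:

    import os

    def pick_resume_checkpoint(model_dir, best_model_path=""):
        """Return the checkpoint to resume from, or None to start fresh."""
        last_checkpoint_path = os.path.join(model_dir, "last.ckpt")
        if os.path.isfile(last_checkpoint_path):
            return last_checkpoint_path
        if best_model_path and os.path.isfile(best_model_path):
            return best_model_path
        return None

    # e.g. trainer.fit(lit_model, datamodule=dataloader,
    #                  ckpt_path=pick_resume_checkpoint(model_dir, checkpoint_callback.best_model_path))
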
b/scripts/shapes/shape_embed.py @@ -73,17 +73,19 @@ def hashing_fn(args): return hashed_string -def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): +def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618, split=0.8): umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) - mask = np.random.rand(len(df)) < 0.7 + mask = np.random.rand(len(df)) < split - semi_labels = df["Class"].copy() - semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision + semi_labels = df.index.codes.copy() + semi_labels[~mask] = -1 - umap_embedding = umap_reducer.fit_transform(df, y=semi_labels) + umap_embedding = umap_reducer.fit_transform(df.sample(frac=1), y=semi_labels) ax = sns.relplot( - data=pd.DataFrame(umap_embedding, columns=["umap0", "umap1"]), + data=pd.DataFrame( + umap_embedding, columns=["umap0", "umap1"], index=df.index + ).reset_index(), x="umap0", y="umap1", hue="Class", @@ -107,6 +109,7 @@ def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): # plt.show() plt.close() + def scoring_df(X, y): # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split( From 22832d6b131266c44d7b71b1c6a954d3196c64ae Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 13:10:09 +0000 Subject: [PATCH 039/204] Made the class column categorical --- scripts/shapes/shape_embed.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b38b4722..49cc9c1d 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -424,17 +424,16 @@ def shape_embed_process(): df["Class"] = y # Map numeric classes to their labels idx_to_class = {0: "alive", 1: "dead"} - df["Class"] = df["Class"].map(idx_to_class) + df["Class"] = df["Class"].map(idx_to_class).astype("category") df["Scale"] = scalings[:, 0].squeeze() df = df.set_index("Class") df_shape_embed = df.copy() # %% UMAP plot - umap_plot(df, metadata, width, height) - # %% + umap_plot(df, metadata, width, height,split=0.9) X = df_shape_embed.to_numpy() - y = df_shape_embed.index.values + y = df_shape_embed.index properties = [ "area", From e567748a3e12e935e8e1e847941fd9459ed0d98c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 12:31:26 +0000 Subject: [PATCH 040/204] modification for slurm --- scripts/shapes/shape_embed_backup.py | 558 +++++++++++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 scripts/shapes/shape_embed_backup.py diff --git a/scripts/shapes/shape_embed_backup.py b/scripts/shapes/shape_embed_backup.py new file mode 100644 index 00000000..eea708e4 --- /dev/null +++ b/scripts/shapes/shape_embed_backup.py @@ -0,0 +1,558 @@ +# %% +import seaborn as sns +import pyefd +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import cross_validate, KFold, train_test_split +from sklearn.metrics import make_scorer +import pandas as pd +from sklearn import metrics +import matplotlib as mpl +import seaborn as sns +from pathlib import Path +import umap +from torch.autograd import Variable +from types import SimpleNamespace +import numpy as np +import logging +from skimage import measure +import umap.plot +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +import pytorch_lightning as pl +import torch +from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +import argparse +import wandb +import shutil + +# Deal with the 
filesystem +import torch.multiprocessing + +torch.multiprocessing.set_sharing_strategy("file_system") + +from bioimage_embed import shapes +import bioimage_embed + +# Note - you must have torchvision installed for this example + +from pytorch_lightning import loggers as pl_loggers +from torchvision import transforms +from bioimage_embed.lightning import DataModule + +from torchvision import datasets +from bioimage_embed.shapes.transforms import ( + ImageToCoords, + CropCentroidPipeline, + DistogramToCoords, + MaskToDistogramPipeline, + RotateIndexingClockwise, +) + +import matplotlib.pyplot as plt + +from bioimage_embed.lightning import DataModule +import matplotlib as mpl +from matplotlib import rc + +import logging +import pickle +import base64 +import hashlib + +logger = logging.getLogger(__name__) + +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def scoring_df(X, y): + # Split the data into training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y + ) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.accuracy_score), + "precision": make_scorer(metrics.precision_score, average="macro"), + "recall": make_scorer(metrics.recall_score, average="macro"), + "f1": make_scorer(metrics.f1_score, average="macro"), + } + + # Create a random forest classifier + clf = RandomForestClassifier() + + # Specify the number of folds + k_folds = 10 + + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=clf, + X=X, + y=y, + cv=KFold(n_splits=k_folds), + scoring=scoring, + n_jobs=-1, + return_train_score=False, + ) + + # Put the results into a DataFrame + return pd.DataFrame(cv_results) + + +def shape_embed_process(clargs): + # Setting the font size + mpl.rcParams["font.size"] = 10 + + # rc("text", usetex=True) + rc("font", **{"family": "sans-serif", "sans-serif": ["Arial"]}) + width = 3.45 + height = width / 1.618 + plt.rcParams["figure.figsize"] = [width, height] + + sns.set(style="white", context="notebook", rc={"figure.figsize": (width, height)}) + + # matplotlib.use("TkAgg") + interp_size = clargs.latent_space_size * 2 + #interp_size = 128 * 2 + max_epochs = 100 + window_size = clargs.latent_space_size * 2 + #window_size = 128 * 2 + + params = { + "model":clargs.model, + #"model":"resnet18_vae", + "epochs": 75, + "batch_size": clargs.batch_size, + #"batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, interp_size, interp_size), + "latent_dim": interp_size, + "num_embeddings": interp_size, + "num_hiddens": interp_size, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + } + + optimizer_params = { + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + } + + lr_scheduler_params = { + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, + } + + args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) + + dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" + dataset = "bbbc010" + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" + metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" + + path = Path(metadata("")) + path.mkdir(parents=True, exist_ok=True) + # %% + + transform_crop = 
CropCentroidPipeline(window_size) + transform_dist = MaskToDistogramPipeline( + window_size, interp_size, matrix_normalised=False + ) + transform_mdscoords = DistogramToCoords(window_size) + transform_coords = ImageToCoords(window_size) + + transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + + transform_mask_to_crop = transforms.Compose( + [ + # transforms.ToTensor(), + transform_mask_to_gray, + transform_crop, + ] + ) + + transform_mask_to_dist = transforms.Compose( + [ + transform_mask_to_crop, + transform_dist, + ] + ) + transform_mask_to_coords = transforms.Compose( + [ + transform_mask_to_crop, + transform_coords, + ] + ) + + transforms_dict = { + "none": transform_mask_to_gray, + "transform_crop": transform_mask_to_crop, + "transform_dist": transform_mask_to_dist, + "transform_coords": transform_mask_to_coords, + } + + train_data = { + key: datasets.ImageFolder(train_data_path, transform=value) + for key, value in transforms_dict.items() + } + + for key, value in train_data.items(): + print(key, len(value)) + plt.imshow(train_data[key][0][0], cmap="gray") + plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + # plt.show() + plt.close() + + # plt.scatter(*train_data["transform_coords"][0][0]) + # plt.savefig(metadata(f"transform_coords.png")) + # plt.show() + + # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") + # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) + # plt.show() + # plt.savefig(metadata(f"transform_coords.png")) + + # Retrieve the coordinates and cropped image + coords = train_data["transform_coords"][0][0] + crop_image = train_data["transform_crop"][0][0] + + fig = plt.figure(frameon=True) + ax = plt.Axes(fig, [0, 0, 1, 1]) + ax.set_axis_off() + fig.add_axes(ax) + + # Display the cropped image using grayscale colormap + plt.imshow(crop_image, cmap="gray_r") + + # Scatter plot with smaller point size + plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) + + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) + + # Close the plot + plt.close() + # import albumentations as A + # %% + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] + ) + + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + except Exception as e: + # A better way to do with would be with batch collation + print(f"Error occurred for image {idx}: {e}") + + # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) + dataloader = DataModule( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + ) + + # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) + # + model = bioimage_embed.models.create_model( + model=args.model, + input_dim=args.input_dim, + latent_dim=args.latent_dim, + pretrained=args.pretrained, + ) + + # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() + + # lit_model = 
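The valid-indices loop above walks the whole dataset once up front, drops any image whose transform raises, and wraps the survivors in a Subset; the in-line comment notes that batch collation would be a better place to do this. A sketch of that alternative under illustrative names (not from the codebase): let __getitem__ swallow the failure and return None, then filter the Nones in a custom collate_fn:

    import torch
    from torch.utils.data import DataLoader, Dataset
    from torch.utils.data.dataloader import default_collate

    class SkipFailingTransforms(Dataset):
        """Wrap a dataset so samples whose transform fails become None."""
        def __init__(self, dataset):
            self.dataset = dataset
        def __len__(self):
            return len(self.dataset)
        def __getitem__(self, idx):
            try:
                return self.dataset[idx]
            except Exception:
                return None

    def drop_none_collate(batch):
        batch = [sample for sample in batch if sample is not None]
        return default_collate(batch) if batch else None

    # loader = DataLoader(SkipFailingTransforms(dataset), batch_size=4,
    #                     collate_fn=drop_none_collate)
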
shapes.MaskEmbedLatentAugment(model, args) + lit_model = shapes.MaskEmbed(model, args) + test_data = dataset[0][0].unsqueeze(0) + # test_lit_data = 2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) + test_output = lit_model.forward((test_data,)) + + dataloader.setup() + model.eval() + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(args)}" + + tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") + + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + wandblogger.watch(lit_model, log="all") + + trainer = pl.Trainer( + logger=[wandblogger,tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + accelerator="gpu", + accumulate_grad_batches=4, + callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=args.epochs, + log_every_n_steps=1, + ) + # %% + try: + trainer.fit( + lit_model, datamodule=dataloader, ckpt_path=f"{model_dir}/last.ckpt" + ) + except: + trainer.fit(lit_model, datamodule=dataloader) + + lit_model.eval() + + validation = trainer.validate(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) + example_input = Variable(torch.rand(1, *args.input_dim)) + + # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") + # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") + + # %% + # Inference + + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} + y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) + + y_partial = y.copy() + indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) + y_partial[indices] = -1 + y_blind = -1 * np.ones_like(y) + + df = pd.DataFrame(latent_space.numpy()) + df["Class"] = y + # Map numeric classes to their labels + idx_to_class = {0: "alive", 1: "dead"} + df["Class"] = df["Class"].map(idx_to_class) + df["Scale"] = scalings[:, 0].squeeze() + df = df.set_index("Class") + df_shape_embed = df.copy() + + # %% + + X = df_shape_embed.to_numpy() + y = df_shape_embed.index.values + + properties = [ + "area", + "perimeter", + "centroid", + "major_axis_length", + "minor_axis_length", + "orientation", + ] + dfs = [] + for i, data in enumerate(train_data["transform_crop"]): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, thus we take the first element [0] + # props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is 
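For the regionprops baseline being assembled here, measure.regionprops_table returns a dict of per-object arrays that drops straight into a DataFrame, one row per labelled object. A tiny self-contained sketch with a hypothetical single-object mask ('centroid' is omitted because it expands into centroid-0/centroid-1 columns):

    import numpy as np
    import pandas as pd
    from skimage import measure

    mask = np.zeros((64, 64), dtype=int)
    mask[16:48, 20:44] = 1   # hypothetical mask containing a single object

    props_table = measure.regionprops_table(
        mask,
        properties=["area", "perimeter", "major_axis_length",
                    "minor_axis_length", "orientation"],
    )
    df = pd.DataFrame(props_table)
    print(df)
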
contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + df_regionprops = pd.concat(dfs) + + # Assuming 'dataset_contour' is your DataLoader for the dataset + dfs = [] + for i, data in enumerate(train_data["transform_coords"]): + # Convert the tensor to a numpy array + X, y = data + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + df_pyefd = pd.concat(dfs) + + trials = [ + { + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_pyefd.xs("coeffs", level="coeffs"), + "labels": df_pyefd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), + # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + }, + ] + + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + print(trial["score_df"]) + trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(metadata(f"trial_df.csv")) + trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) + trial_df.plot(kind="bar") + + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + #wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(metadata(f"trials_barplot.pdf")) + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + print(avs) + # tikzplotlib.save(metadata(f"trials_barplot.tikz")) + + + + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') + #parser.add_argument('-v', '--verbose', action='count', default=0, + # help="Increase verbosity level by adding more \"v\".") + + shape_embed_process(parser.parse_args()) From 161b0a0e6f9fe6dfa323628043e8ad8b634aff23 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 12:33:00 +0000 Subject: [PATCH 041/204] changes in the shape embed script --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 25326431..eea708e4 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -138,7 +138,7 @@ def shape_embed_process(clargs): } optimizer_params = { - "opt": "LAMB", + "opt": "AdamW", "lr": 0.001, "weight_decay": 0.0001, "momentum": 0.9, @@ -555,4 +555,4 @@ def auto_pos_int (x): #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") - shape_embed_process(parser.parse_args()) \ No newline at end of file + shape_embed_process(parser.parse_args()) From a6cb292c1650c0c626ce08bb4f1c7d024fd48a60 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:24:25 +0000 Subject: [PATCH 042/204] fix merge commit + add command line args for dataset (name and path) and wandb project --- scripts/shapes/shape_embed.py | 810 +++++++++++++++++----------------- 1 file changed, 409 insertions(+), 401 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 24aad0fc..32e6b898 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -174,406 +174,406 @@ def shape_embed_process(clargs): #window_size = 128 * 2 params = { - "model":clargs.model, - #"model":"resnet18_vae", - "epochs": 250, - "batch_size": clargs.batch_size, - #"batch_size": 4, - "num_workers": 2**4, - "input_dim": (3, interp_size, interp_size), - "latent_dim": interp_size, - "num_embeddings": interp_size, - "num_hiddens": interp_size, - "pretrained": True, - "commitment_cost": 0.25, - "decay": 0.99, - "frobenius_norm": False, -} - -optimizer_params = { - "opt": "AdamW", - "lr": 0.001, - "weight_decay": 0.0001, - "momentum": 0.9, -} - -lr_scheduler_params = { - "sched": "cosine", - "min_lr": 1e-4, - "warmup_epochs": 5, - "warmup_lr": 1e-6, - "cooldown_epochs": 10, - "t_max": 50, - "cycle_momentum": False, -} - -args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - -dataset_path = 
"bbbc010/BBBC010_v1_foreground_eachworm/" -dataset = "bbbc010" -train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" -metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" - -path = Path(metadata("")) -path.mkdir(parents=True, exist_ok=True) -# %% - -transform_crop = CropCentroidPipeline(window_size) -transform_dist = MaskToDistogramPipeline( - window_size, interp_size, matrix_normalised=False -) -transform_mdscoords = DistogramToCoords(window_size) -transform_coords = ImageToCoords(window_size) - -transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) - -transform_mask_to_crop = transforms.Compose( - [ - # transforms.ToTensor(), - transform_mask_to_gray, - transform_crop, - ] -) - -transform_mask_to_dist = transforms.Compose( - [ - transform_mask_to_crop, - transform_dist, - ] -) -transform_mask_to_coords = transforms.Compose( - [ - transform_mask_to_crop, - transform_coords, - ] -) - -transforms_dict = { - "none": transform_mask_to_gray, - "transform_crop": transform_mask_to_crop, - "transform_dist": transform_mask_to_dist, - "transform_coords": transform_mask_to_coords, -} - -train_data = { - key: datasets.ImageFolder(train_data_path, transform=value) - for key, value in transforms_dict.items() -} - -for key, value in train_data.items(): - print(key, len(value)) - plt.imshow(train_data[key][0][0], cmap="gray") - plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + "model":clargs.model, + #"model":"resnet18_vae", + "epochs": 250, + "batch_size": clargs.batch_size, + #"batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, interp_size, interp_size), + "latent_dim": interp_size, + "num_embeddings": interp_size, + "num_hiddens": interp_size, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + } + + optimizer_params = { + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + } + + lr_scheduler_params = { + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, + } + + args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) + + dataset_path = clargs.dataset[1] + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" + metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" + + path = Path(metadata("")) + path.mkdir(parents=True, exist_ok=True) + # %% + + transform_crop = CropCentroidPipeline(window_size) + transform_dist = MaskToDistogramPipeline( + window_size, interp_size, matrix_normalised=False + ) + transform_mdscoords = DistogramToCoords(window_size) + transform_coords = ImageToCoords(window_size) + + transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + + transform_mask_to_crop = transforms.Compose( + [ + # transforms.ToTensor(), + transform_mask_to_gray, + transform_crop, + ] + ) + + transform_mask_to_dist = transforms.Compose( + [ + transform_mask_to_crop, + transform_dist, + ] + ) + transform_mask_to_coords = transforms.Compose( + [ + transform_mask_to_crop, + transform_coords, + ] + ) + + transforms_dict = { + "none": transform_mask_to_gray, + "transform_crop": transform_mask_to_crop, + "transform_dist": transform_mask_to_dist, + "transform_coords": transform_mask_to_coords, + } + + train_data = { + key: datasets.ImageFolder(train_data_path, transform=value) + for key, value in transforms_dict.items() + } + + for key, value in train_data.items(): + print(key, len(value)) + plt.imshow(train_data[key][0][0], 
cmap="gray") + plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + # plt.show() + plt.close() + + # plt.scatter(*train_data["transform_coords"][0][0]) + # plt.savefig(metadata(f"transform_coords.png")) # plt.show() + + # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") + # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) + # plt.show() + # plt.savefig(metadata(f"transform_coords.png")) + + # Retrieve the coordinates and cropped image + coords = train_data["transform_coords"][0][0] + crop_image = train_data["transform_crop"][0][0] + + fig = plt.figure(frameon=True) + ax = plt.Axes(fig, [0, 0, 1, 1]) + ax.set_axis_off() + fig.add_axes(ax) + + # Display the cropped image using grayscale colormap + plt.imshow(crop_image, cmap="gray_r") + + # Scatter plot with smaller point size + plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) + + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) + + # Close the plot plt.close() - -# plt.scatter(*train_data["transform_coords"][0][0]) -# plt.savefig(metadata(f"transform_coords.png")) -# plt.show() - -# plt.imshow(train_data["transform_crop"][0][0], cmap="gray") -# plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) -# plt.show() -# plt.savefig(metadata(f"transform_coords.png")) - -# Retrieve the coordinates and cropped image -coords = train_data["transform_coords"][0][0] -crop_image = train_data["transform_crop"][0][0] - -fig = plt.figure(frameon=True) -ax = plt.Axes(fig, [0, 0, 1, 1]) -ax.set_axis_off() -fig.add_axes(ax) - -# Display the cropped image using grayscale colormap -plt.imshow(crop_image, cmap="gray_r") - -# Scatter plot with smaller point size -plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) - -# Save the plot as an image without border and coordinate axes -plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) - -# Close the plot -plt.close() -# import albumentations as A -# %% -gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) -transform = transforms.Compose( - [ - transform_mask_to_dist, - transforms.ToTensor(), - RotateIndexingClockwise(p=1), - gray2rgb, - ] -) - -dataset = datasets.ImageFolder(train_data_path, transform=transform) - -valid_indices = [] -# Iterate through the dataset and apply the transform to each image -for idx in range(len(dataset)): - try: - image, label = dataset[idx] - # If the transform works without errors, add the index to the list of valid indices - valid_indices.append(idx) - except Exception as e: - # A better way to do with would be with batch collation - print(f"Error occurred for image {idx}: {e}") - -# Create a Subset using the valid indices -dataset = torch.utils.data.Subset(dataset, valid_indices) -dataloader = DataModule( - dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=args.num_workers, -) - -# model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) -# -model = bioimage_embed.models.create_model( - model=args.model, - input_dim=args.input_dim, - latent_dim=args.latent_dim, - pretrained=args.pretrained, -) - -# model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() - -# lit_model = shapes.MaskEmbedLatentAugment(model, args) -lit_model = shapes.MaskEmbed(model, args) -test_data = dataset[0][0].unsqueeze(0) -# test_lit_data = 
2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) -test_output = lit_model.forward((test_data,)) - -dataloader.setup() -model.eval() - -if clargs.clear_checkpoints: - print("cleaning checkpoints") - shutil.rmtree("checkpoints/") -model_dir = f"checkpoints/{hashing_fn(args)}" - -tb_logger = pl_loggers.TensorBoardLogger(f"logs/") -wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") - -Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) - -checkpoint_callback = ModelCheckpoint( - dirpath=f"{model_dir}/", - save_last=True, - save_top_k=1, - monitor="loss/val", - mode="min", -) -wandb.watch(lit_model, log="all") - -trainer = pl.Trainer( - logger=[wandb, tb_logger], - gradient_clip_val=0.5, - enable_checkpointing=True, - devices=1, - accelerator="gpu", - accumulate_grad_batches=4, - callbacks=[checkpoint_callback], - min_epochs=50, - max_epochs=args.epochs, - log_every_n_steps=1, -) -# %% - -# Determine the checkpoint path for resuming -last_checkpoint_path = f"{model_dir}/last.ckpt" -best_checkpoint_path = checkpoint_callback.best_model_path - -# Check if a last checkpoint exists to resume from -if os.path.isfile(last_checkpoint_path): - resume_checkpoint = last_checkpoint_path -elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): - resume_checkpoint = best_checkpoint_path -else: - resume_checkpoint = None - -trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) - -lit_model.eval() - -validation = trainer.validate(lit_model, datamodule=dataloader) -testing = trainer.test(lit_model, datamodule=dataloader) -example_input = Variable(torch.rand(1, *args.input_dim)) - -# torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") -# torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") - -# %% -# Inference - -dataloader = DataModule( - dataset, - batch_size=1, - shuffle=False, - num_workers=args.num_workers, - # Transform is commented here to avoid augmentations in real data - # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings - # transform=transform, - # transform=transform, -) -dataloader.setup() - -predictions = trainer.predict(lit_model, datamodule=dataloader) - -# Use the namespace variables -latent_space = torch.stack([d.out.z.flatten() for d in predictions]) -scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) -idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} -y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) - -df = pd.DataFrame(latent_space.numpy()) -df["Class"] = y -# Map numeric classes to their labels -idx_to_class = {0: "alive", 1: "dead"} -df["Class"] = df["Class"].map(idx_to_class).astype("category") -df["Scale"] = scalings[:, 0].squeeze() -df = df.set_index("Class") -df_shape_embed = df.copy() - -# %% -# %% UMAP plot -umap_plot(df, metadata, width, height,split=0.9) - -X = df_shape_embed.to_numpy() -y = df_shape_embed.index - -properties = [ - "area", - "perimeter", - "centroid", - "major_axis_length", - "minor_axis_length", - "orientation", -] -dfs = [] -for i, data in enumerate(train_data["transform_crop"]): - X, y = data - # Do regionprops here - # Calculate shape summary statistics using regionprops - # We're considering that the mask has only one object, thus we take the first element [0] - # props = regionprops(np.array(X).astype(int))[0] - props_table = measure.regionprops_table( - 
np.array(X).astype(int), properties=properties + # import albumentations as A + # %% + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] ) - - # Store shape properties in a dataframe - df = pd.DataFrame(props_table) - - # Assuming the class or label is contained in 'y' variable - df["class"] = y - df.set_index("class", inplace=True) - dfs.append(df) - -df_regionprops = pd.concat(dfs) - -# Assuming 'dataset_contour' is your DataLoader for the dataset -dfs = [] -for i, data in enumerate(train_data["transform_coords"]): - # Convert the tensor to a numpy array - X, y = data - - # Feed it to PyEFD's calculate_efd function - coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) - # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) - - norm_coeffs = pyefd.normalize_efd(coeffs) - df = pd.DataFrame( + + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + except Exception as e: + # A better way to do with would be with batch collation + print(f"Error occurred for image {idx}: {e}") + + # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) + dataloader = DataModule( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + ) + + # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) + # + model = bioimage_embed.models.create_model( + model=args.model, + input_dim=args.input_dim, + latent_dim=args.latent_dim, + pretrained=args.pretrained, + ) + + # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() + + # lit_model = shapes.MaskEmbedLatentAugment(model, args) + lit_model = shapes.MaskEmbed(model, args) + test_data = dataset[0][0].unsqueeze(0) + # test_lit_data = 2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) + test_output = lit_model.forward((test_data,)) + + dataloader.setup() + model.eval() + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(args)}" + + tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + jobname = f"{params['model']}_{interp_size}_{params['batch_size']}_{clargs.dataset[0]}" + wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) + + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) + wandb.watch(lit_model, log="all") + + trainer = pl.Trainer( + logger=[wandb, tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + accelerator="gpu", + accumulate_grad_batches=4, + callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=args.epochs, + log_every_n_steps=1, + ) + # %% + + # Determine the checkpoint path for resuming + last_checkpoint_path = f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = 
last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + + trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) + + lit_model.eval() + + validation = trainer.validate(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) + example_input = Variable(torch.rand(1, *args.input_dim)) + + # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") + # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") + + # %% + # Inference + + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} + y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) + + df = pd.DataFrame(latent_space.numpy()) + df["Class"] = y + # Map numeric classes to their labels + idx_to_class = {0: "alive", 1: "dead"} + df["Class"] = df["Class"].map(idx_to_class).astype("category") + df["Scale"] = scalings[:, 0].squeeze() + df = df.set_index("Class") + df_shape_embed = df.copy() + + # %% + # %% UMAP plot + umap_plot(df, metadata, width, height,split=0.9) + + X = df_shape_embed.to_numpy() + y = df_shape_embed.index + + properties = [ + "area", + "perimeter", + "centroid", + "major_axis_length", + "minor_axis_length", + "orientation", + ] + dfs = [] + for i, data in enumerate(train_data["transform_crop"]): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, thus we take the first element [0] + # props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + df_regionprops = pd.concat(dfs) + + # Assuming 'dataset_contour' is your DataLoader for the dataset + dfs = [] + for i, data in enumerate(train_data["transform_coords"]): + # Convert the tensor to a numpy array + X, y = data + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + df_pyefd = pd.concat(dfs) + + trials = [ { - "norm_coeffs": norm_coeffs.flatten().tolist(), - "coeffs": coeffs.flatten().tolist(), - } - ).T.rename_axis("coeffs") - df["class"] = y - df.set_index("class", inplace=True, append=True) - dfs.append(df) - 
-df_pyefd = pd.concat(dfs) - -trials = [ - { - "name": "mask_embed", - "features": df_shape_embed.to_numpy(), - "labels": df_shape_embed.index, - }, - { - "name": "fourier_coeffs", - "features": df_pyefd.xs("coeffs", level="coeffs"), - "labels": df_pyefd.xs("coeffs", level="coeffs").index, - }, - # {"name": "fourier_norm_coeffs", - # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), - # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index - # } - { - "name": "regionprops", - "features": df_regionprops, - "labels": df_regionprops.index, - }, -] - -trial_df = pd.DataFrame() -for trial in trials: - X = trial["features"] - y = trial["labels"] - trial["score_df"] = scoring_df(X, y) - trial["score_df"]["trial"] = trial["name"] - print(trial["score_df"]) - trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) - trial_df = pd.concat([trial_df, trial["score_df"]]) -trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) - -trial_df.to_csv(metadata(f"trial_df.csv")) -trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) -trial_df.plot(kind="bar") - -#mean_df = trial_df.groupby("trial").mean() -#std_df = trial_df.groupby("trial").std() -#wandb.log_table(mean_df) -#wandb.log_table(std_df) - -#Special metrics for f1 score for wandb -wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) -mean_df = trial_df.groupby("trial").mean() -std_df = trial_df.groupby("trial").std() -wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) -wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) - -melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") -# fig, ax = plt.subplots(figsize=(width, height)) -ax = sns.catplot( - data=melted_df, - kind="bar", - x="trial", - hue="Metric", - y="Score", - errorbar="se", - height=height, - aspect=width * 2**0.5 / height, -) -# ax.xtick_params(labelrotation=45) -# plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) -# sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) -# ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') -# plt.tight_layout() -plt.savefig(metadata(f"trials_barplot.pdf")) -plt.close() - -avs = ( - melted_df.set_index(["trial", "Metric"]) - .xs("test_f1", level="Metric", drop_level=False) - .groupby("trial") - .mean() -) -print(avs) -# tikzplotlib.save(metadata(f"trials_barplot.tikz")) + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_pyefd.xs("coeffs", level="coeffs"), + "labels": df_pyefd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), + # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + }, + ] + + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + print(trial["score_df"]) + trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(metadata(f"trial_df.csv")) + trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) + trial_df.plot(kind="bar") + + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + 
#wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(metadata(f"trials_barplot.pdf")) + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + print(avs) + # tikzplotlib.save(metadata(f"trials_barplot.tikz")) @@ -582,11 +582,11 @@ def shape_embed_process(clargs): if __name__ == "__main__": -def auto_pos_int (x): - val = int(x,0) - if val <= 0: - raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) - return val + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val parser = argparse.ArgumentParser(description='Run the shape embed pipeline') @@ -608,6 +608,12 @@ def auto_pos_int (x): parser.add_argument( '-m', '--model', choices=models, default=models[0], metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset") + parser.add_argument( + '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' + , help=f"The wandb PROJECT name") parser.add_argument( '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int , help="The BATCH_SIZE for the run, a positive integer (default 4)") @@ -619,4 +625,6 @@ def auto_pos_int (x): #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") + #clargs=parser.parse_args() + #print(clargs.dataset) shape_embed_process(parser.parse_args()) From 19990760bcd8fd5e442b1d68d1f01f8d8b266fe7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:26:44 +0000 Subject: [PATCH 043/204] duplicated slurm script + specify dataset --- slurm_shape_embed_dataset.py | 104 +++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 slurm_shape_embed_dataset.py diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py new file mode 100644 index 00000000..2eb2ff61 --- /dev/null +++ b/slurm_shape_embed_dataset.py @@ -0,0 +1,104 @@ +#! 
/usr/bin/env python3 + +import os +import subprocess +import tempfile + +## Assign the arguments to variables +#model_arg=$1 +#sizes_list="${@:2}" +# +## Create SLURM job script +#job_script="slurm_job.sh" +# +#echo "#!/bin/bash" > "$job_script" +#echo "#SBATCH --job-name=ite_shape_embed" >> "$job_script" +#echo "#SBATCH --output=ite_shape_embed.out" >> "$job_script" +#echo "#SBATCH --error=ite_shape_embed.err" >> "$job_script" +#echo "#SBATCH --gres=gpu:2" >> "$job_script" # Adjust the number of CPUs as needed +#echo "#SBATCH --mem=50GB" >> "$job_script" # Adjust the memory requirement as needed +#echo "" >> "$job_script" +# +## Loop through the sizes and append the Python command to the job script +#for size in $sizes_list; do +# echo "python ite_shape_embed.py --model $model_arg --ls_size $size" >> "$job_script" +#done +# +## Submit SLURM job +#sbatch "$job_script" + +models = [ + "resnet18_vae" +, "resnet18_vqvae" +, "resnet18_vqvae_legacy" +, "resnet18_vae_legacy" +] +batch_sizes = [4] +latent_space_sizes = [512] + +datasets = [ + ("vampire", "vampire/torchvision/Control/") +, ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") +, ("synthcell", "synthcellshapes_dataset") +#, ("helakyoto", "") +] + +wandb_project='shape-embed-ite-dataset' + +slurm_script="""#!/bin/bash + +echo "running shape embed with:" +echo " - model {model}" +echo " - dataset {dataset[0]} ({dataset[1]})" +echo " - batch size {b_size}" +echo " - latent space size {ls_size}" +rand_name=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 16) +mkdir -p slurm_rundir/$rand_name +cp -r $(ls | grep -v slurm_rundir) slurm_rundir/$rand_name/. +cd slurm_rundir/$rand_name +python3 scripts/shapes/shape_embed.py --wandb-project {wandb_project} --model {model} --dataset {dataset[0]} {dataset[1]} --batch-size {b_size} --latent-space-size {ls_size} --clear-checkpoints +""" + +def mem_size(ls): + if ls <= 128: + return '50GB' + if ls > 128: + return '100GB' + if ls > 256: + return '300GB' + +def n_gpus(ls): + if ls <= 128: + return 'gpu:2' + if ls > 128: + return 'gpu:2' + if ls > 256: + return 'gpu:3' + +if __name__ == "__main__": + + slurmdir = f'{os.getcwd()}/slurmdir' + os.makedirs(slurmdir, exist_ok=True) + for m, bs, ls, ds in [ (m,bs,ls,ds) for m in models + for bs in batch_sizes + for ls in latent_space_sizes + for ds in datasets ]: + jobname = f'shape_embed_{m}_{ds[0]}_{bs}_{ls}' + print(jobname) + fp = open(mode='w+', file=f'{slurmdir}/slurm_script_shape_embed_{m}_{bs}_{ls}.script') + fp.write(slurm_script.format(model=m, dataset=ds, b_size=bs, ls_size=ls, wandb_project=wandb_project)) + fp.flush() + print(f'{fp.name}') + print(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) + print(mem_size(ls)) + result = subprocess.run([ 'sbatch' + , '--time', '10:00:00' + , '--mem', mem_size(ls) + , '--job-name', jobname + , '--output', f'{slurmdir}/{jobname}.out' + , '--error', f'{slurmdir}/{jobname}.err' + , '--gres', n_gpus(ls) + , fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) From d3525a8817d11db0960743d19eafb4a34a2ce590 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:52:28 +0000 Subject: [PATCH 044/204] Fix wandb logger --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 32e6b898..948e0f5c 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -367,10 +367,10 @@ def 
shape_embed_process(clargs): monitor="loss/val", mode="min", ) - wandb.watch(lit_model, log="all") + wandblogger.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb, tb_logger], + logger=[wandblogger, tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, From 7a60b2a5a66c4f152ab6cbfd65117cbe92de2ab9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:52:54 +0000 Subject: [PATCH 045/204] Add helakyoto dataset to the slurm script --- slurm_shape_embed_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 2eb2ff61..a0b16047 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -39,8 +39,8 @@ datasets = [ ("vampire", "vampire/torchvision/Control/") , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") -, ("synthcell", "synthcellshapes_dataset") -#, ("helakyoto", "") +, ("synthcell", "synthcellshapes_dataset/") +, ("helakyoto", "H2b_10x_MD_exp665/samples/") ] wandb_project='shape-embed-ite-dataset' From e43e8394bf97d1fa609b3e1637d812d4dc0f4f68 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:48:24 +0000 Subject: [PATCH 046/204] better imports --- scripts/shapes/shape_embed.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49cc9c1d..f0850a4d 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -9,14 +9,11 @@ import pandas as pd from sklearn import metrics import matplotlib as mpl -import seaborn as sns from pathlib import Path from sklearn.pipeline import Pipeline -import umap from torch.autograd import Variable from types import SimpleNamespace import numpy as np -import logging from skimage import measure import umap.plot from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint @@ -25,17 +22,16 @@ from types import SimpleNamespace from umap import UMAP import os - -# Deal with the filesystem import torch.multiprocessing +import logging +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) torch.multiprocessing.set_sharing_strategy("file_system") from bioimage_embed import shapes import bioimage_embed - -# Note - you must have torchvision installed for this example - from pytorch_lightning import loggers as pl_loggers from torchvision import transforms from bioimage_embed.lightning import DataModule @@ -47,16 +43,15 @@ DistogramToCoords, MaskToDistogramPipeline, RotateIndexingClockwise, + CoordsToDistogram, ) - import matplotlib.pyplot as plt from bioimage_embed.lightning import DataModule import matplotlib as mpl from matplotlib import rc -import logging -import pickle +import pickle import base64 import hashlib @@ -66,6 +61,7 @@ np.random.seed(42) pl.seed_everything(42) + def hashing_fn(args): serialized_args = pickle.dumps(vars(args)) hash_object = hashlib.sha256(serialized_args) From ffbd8eae64043c10ed68d63a35e42e8b1476cfca Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:49:10 +0000 Subject: [PATCH 047/204] Adding dataset path to args for better checkpointing --- scripts/shapes/shape_embed.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f0850a4d..7bb4ce24 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -172,6 +172,9 @@ def shape_embed_process(): "latent_dim": int(128), "pretrained": 
True, "frobenius_norm": False, + # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" + # dataset = "vampire/mefs/data/processed/Control" + "dataset": "synthcellshapes_dataset", } optimizer_params = { @@ -193,15 +196,9 @@ def shape_embed_process(): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" - dataset_path = "shape_embed_data/data/bbbc010/BBBC010_v1_foreground_eachworm/" - # dataset_path = "vampire/mefs/data/processed/Control" - # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" - # dataset_path = "vampire/torchvision/Control" - # dataset = "bbbc010" + dataset_path = args.dataset - # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = f"scripts/shapes/data/{dataset_path}" + train_data_path = f"data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) From a3e82a90eee22acccbcb2858a0ae4800631e5056 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:49:46 +0000 Subject: [PATCH 048/204] Imrproved dataset logic so that dist depends on coords --- scripts/shapes/shape_embed.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 7bb4ce24..b2f95c64 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -206,9 +206,10 @@ def shape_embed_process(): # %% transform_crop = CropCentroidPipeline(window_size) - transform_dist = MaskToDistogramPipeline( - window_size, interp_size, matrix_normalised=False - ) + # transform_dist = MaskToDistogramPipeline( + # window_size, interp_size, matrix_normalised=False + # ) + transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) transform_mdscoords = DistogramToCoords(window_size) transform_coords = ImageToCoords(window_size) @@ -222,16 +223,27 @@ def shape_embed_process(): ] ) - transform_mask_to_dist = transforms.Compose( + transform_mask_to_coords = transforms.Compose( [ transform_mask_to_crop, - transform_dist, + transform_coords, ] ) - transform_mask_to_coords = transforms.Compose( + + transform_mask_to_dist = transforms.Compose( [ - transform_mask_to_crop, - transform_coords, + transform_mask_to_coords, + transform_coord_to_dist, + ] + ) + + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, ] ) From 5b720d4ba3caaf4b50803b4496267fba49dd7828 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:50:13 +0000 Subject: [PATCH 049/204] Improved data rejection for datatsets --- scripts/shapes/shape_embed.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b2f95c64..fc29eeb5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -254,11 +254,31 @@ def shape_embed_process(): "transform_coords": transform_mask_to_coords, } + # Apply transform to find which images don't work + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + 
except Exception as e: + # A better way to do with would be with batch collation + logger.warning(f"Error occurred for image {idx}: {e}") + train_data = { - key: datasets.ImageFolder(train_data_path, transform=value) + key: torch.utils.data.Subset( + datasets.ImageFolder(train_data_path, transform=value), valid_indices + ) for key, value in transforms_dict.items() } + dataset = torch.utils.data.Subset( + datasets.ImageFolder(train_data_path, transform=transform), valid_indices + ) + for key, value in train_data.items(): print(key, len(value)) plt.imshow(train_data[key][0][0], cmap="gray") From dea41f65c46a2c0a49b2d7ce9f73484b6d17b301 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:50:39 +0000 Subject: [PATCH 050/204] Removing redundant code and adding logging --- scripts/shapes/shape_embed.py | 42 ++--------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index fc29eeb5..3dab09b3 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -280,21 +280,12 @@ def shape_embed_process(): ) for key, value in train_data.items(): - print(key, len(value)) - plt.imshow(train_data[key][0][0], cmap="gray") + logger.info(key, len(value)) + plt.imshow(np.array(train_data[key][0][0]), cmap="gray") plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") # plt.show() plt.close() - # plt.scatter(*train_data["transform_coords"][0][0]) - # plt.savefig(metadata(f"transform_coords.png")) - # plt.show() - - # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") - # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) - # plt.show() - # plt.savefig(metadata(f"transform_coords.png")) - # Retrieve the coordinates and cropped image coords = train_data["transform_coords"][0][0] crop_image = train_data["transform_crop"][0][0] @@ -315,33 +306,8 @@ def shape_embed_process(): # Close the plot plt.close() - # import albumentations as A - # %% - gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) - transform = transforms.Compose( - [ - transform_mask_to_dist, - transforms.ToTensor(), - RotateIndexingClockwise(p=1), - gray2rgb, - ] - ) - - dataset = datasets.ImageFolder(train_data_path, transform=transform) - - valid_indices = [] - # Iterate through the dataset and apply the transform to each image - for idx in range(len(dataset)): - try: - image, label = dataset[idx] - # If the transform works without errors, add the index to the list of valid indices - valid_indices.append(idx) - except Exception as e: - # A better way to do with would be with batch collation - print(f"Error occurred for image {idx}: {e}") # Create a Subset using the valid indices - dataset = torch.utils.data.Subset(dataset, valid_indices) dataloader = DataModule( dataset, batch_size=args.batch_size, @@ -349,8 +315,6 @@ def shape_embed_process(): num_workers=args.num_workers, ) - # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) - # model = bioimage_embed.models.create_model( model=args.model, input_dim=args.input_dim, @@ -358,8 +322,6 @@ def shape_embed_process(): pretrained=args.pretrained, ) - # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() - # lit_model = shapes.MaskEmbedLatentAugment(model, args) lit_model = shapes.MaskEmbed(model, args) test_data = dataset[0][0].unsqueeze(0) From 85f1211860dc9567b902767ae10da57ecb1c1496 Mon Sep 17 00:00:00 2001 From: Craig 
Russell Date: Thu, 29 Feb 2024 16:51:09 +0000 Subject: [PATCH 051/204] [bug] Removing hard coded idx mapper --- scripts/shapes/shape_embed.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 3dab09b3..19bf4267 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -385,16 +385,14 @@ def shape_embed_process(): # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") # %% - # Inference - + # Inference on full dataset dataloader = DataModule( dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, # Transform is commented here to avoid augmentations in real data - # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings - # transform=transform, + # HOWEVER, applying the transform multiple times and averaging the results might produce better latent embeddings # transform=transform, ) dataloader.setup() @@ -408,16 +406,14 @@ def shape_embed_process(): y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) df = pd.DataFrame(latent_space.numpy()) - df["Class"] = y - # Map numeric classes to their labels - idx_to_class = {0: "alive", 1: "dead"} - df["Class"] = df["Class"].map(idx_to_class).astype("category") + df["Class"] = pd.Series(y).map(idx_to_class).astype("category") df["Scale"] = scalings[:, 0].squeeze() df = df.set_index("Class") df_shape_embed = df.copy() # %% UMAP plot - umap_plot(df, metadata, width, height,split=0.9) + + umap_plot(df, metadata, width, height, split=0.9) X = df_shape_embed.to_numpy() y = df_shape_embed.index From e411a9da9577d5641763cf15f172ac8a6469764d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:51:44 +0000 Subject: [PATCH 052/204] Adding logging and and tqdm so it doesnt look like the code is hanging --- scripts/shapes/shape_embed.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 19bf4267..abb02879 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -427,11 +427,12 @@ def shape_embed_process(): "orientation", ] dfs = [] - for i, data in enumerate(train_data["transform_crop"]): + # Distance matrix data + for i, data in enumerate(tqdm(train_data["transform_crop"])): X, y = data # Do regionprops here # Calculate shape summary statistics using regionprops - # We're considering that the mask has only one object, thus we take the first element [0] + # We're considering that the mask has only one object, so we take the first element [0] # props = regionprops(np.array(X).astype(int))[0] props_table = measure.regionprops_table( np.array(X).astype(int), properties=properties @@ -447,9 +448,8 @@ def shape_embed_process(): df_regionprops = pd.concat(dfs) - # Assuming 'dataset_contour' is your DataLoader for the dataset dfs = [] - for i, data in enumerate(train_data["transform_coords"]): + for i, data in enumerate(tqdm(train_data["transform_coords"])): # Convert the tensor to a numpy array X, y = data @@ -498,7 +498,7 @@ def shape_embed_process(): y = trial["labels"] trial["score_df"] = scoring_df(X, y) trial["score_df"]["trial"] = trial["name"] - print(trial["score_df"]) + logger.info(trial["score_df"]) trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) trial_df = pd.concat([trial_df, trial["score_df"]]) trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) From 
66df0b991e4a5b03b3636017f49e0a2360c747ab Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:52:10 +0000 Subject: [PATCH 053/204] More logging --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index abb02879..87e8be5a 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -533,7 +533,7 @@ def shape_embed_process(): .groupby("trial") .mean() ) - print(avs) + logger.info(avs) # tikzplotlib.save(metadata(f"trials_barplot.tikz")) From b5db3670241277a732fa6ae982f542664d035fad Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:57:31 +0000 Subject: [PATCH 054/204] Adding a disk cleanup step --- .github/workflows/test.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dbf6e63d..290f82f8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -21,6 +21,16 @@ jobs: # shell: bash -l {0} steps: - uses: actions/checkout@v2 + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true - uses: conda-incubator/setup-miniconda@v2 with: environment-file: environment.yml From f580f97b921f7cef00c4a73e4423f9eca0ef66fa Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:59:33 +0000 Subject: [PATCH 055/204] Reverting to other path structure --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 87e8be5a..61f34366 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -198,7 +198,7 @@ def shape_embed_process(): dataset_path = args.dataset - train_data_path = f"data/{dataset_path}" + train_data_path = f"scripts/shapes/data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) From ae9f8d1907e1a57a6fc5fd5a3867e31f9621e32d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 17:26:12 +0000 Subject: [PATCH 056/204] Adding average cross val to logs --- scripts/shapes/shape_embed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 61f34366..071127ed 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -507,6 +507,10 @@ def shape_embed_process(): trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") + avg = trial_df.groupby("trial").mean() + logger.info(avg) + avg.to_latex(metadata(f"trial_df.tex")) + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) ax = sns.catplot( From dcdfa6389b34233c5aa9f744b76ab95ebb42f5f1 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 17:26:23 +0000 Subject: [PATCH 057/204] Adding opencv to env file --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 568d68ce..cdd9cd54 100644 --- a/environment.yml +++ b/environment.yml @@ -14,5 +14,6 @@ dependencies: - pytorch - pillow=9.5.0 - pip +- conda-forge::opencv - pip: - -e . 
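For context before the next patches: the per-trial averaging introduced in PATCH 056 above collapses the cross-validation folds produced by scoring_df() into one row per feature set, and the same patch exports that table with to_latex(). Below is a minimal, self-contained sketch of that step (toy numbers only, not part of the patch series; the real trial_df has one row per fold and further metric columns such as test_precision and test_recall):

    # Illustrative sketch only -- not part of the patches.
    # Assumes a toy trial_df shaped like the one written by scoring_df():
    # one row per CV fold, metric columns, plus a "trial" column naming
    # the feature set being benchmarked.
    import pandas as pd

    trial_df = pd.DataFrame(
        {
            "trial": ["mask_embed", "mask_embed", "regionprops", "regionprops"],
            "test_f1": [0.91, 0.88, 0.76, 0.79],
            "test_accuracy": [0.92, 0.90, 0.80, 0.81],
        }
    )

    avg = trial_df.groupby("trial").mean()  # one row per feature set, averaged over folds
    print(avg)
    print(avg.to_latex())  # same export the patch writes to metadata("trial_df.tex")

The groupby/mean pair is also what lands in trial_df_mean.csv earlier in the script, so the LaTeX table and the CSV stay consistent by construction.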
From e8ca2cb52f540ad78637671a613822d41699840c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 1 Mar 2024 08:26:47 +0000 Subject: [PATCH 058/204] Added allen dataset --- slurm_shape_embed_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index a0b16047..4298dc18 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -41,6 +41,7 @@ , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") , ("synthcell", "synthcellshapes_dataset/") , ("helakyoto", "H2b_10x_MD_exp665/samples/") +, ("allen", "allen_dataset/") ] wandb_project='shape-embed-ite-dataset' From 775442c471ea477c9c7e0fcc9a322a338179449f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 1 Mar 2024 08:27:17 +0000 Subject: [PATCH 059/204] Limit time per job increased to 24h --- slurm_shape_embed_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 4298dc18..6adf0bad 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -95,7 +95,7 @@ def n_gpus(ls): print(result.stdout.decode('utf-8')) print(mem_size(ls)) result = subprocess.run([ 'sbatch' - , '--time', '10:00:00' + , '--time', '24:00:00' , '--mem', mem_size(ls) , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' From 2fce547fa7747e02f230d0db9b86f4138c885e8b Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Fri, 1 Mar 2024 17:30:05 +0000 Subject: [PATCH 060/204] Fixing case where multiple contours are found, chose the longest --- bioimage_embed/shapes/contours.py | 2 +- bioimage_embed/shapes/transforms.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/bioimage_embed/shapes/contours.py b/bioimage_embed/shapes/contours.py index fd82c4ba..6845b97f 100644 --- a/bioimage_embed/shapes/contours.py +++ b/bioimage_embed/shapes/contours.py @@ -35,7 +35,7 @@ def cubic_polar_resample_contour(contour: np.array, size: int) -> np.array: def contour_to_xy(contour: np.array): - return contour[0][:, 0], contour[0][:, 1] + return contour[:, 0], contour[:, 1] def uniform_spline_resample_contour(contour: np.array, size: int) -> np.array: diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 1d350a04..2abd401f 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -173,6 +173,12 @@ def get_distogram(self, coords, matrix_normalised=False): return distance_matrix / np.linalg.norm([self.size, self.size]) +def find_longest_array(arrays): + lengths = [len(arr.flatten()) for arr in arrays] + max_length_index = np.argmax(lengths) + return arrays[max_length_index] + + class ImageToCoords(torch.nn.Module): def __init__(self, size): super().__init__() @@ -204,7 +210,8 @@ def get_coords_C( return torch.tensor(np.array(coords_list)) def get_coords(self, image, size, method="uniform_spline", contour_level=0.8): - contour = find_contours(np.array(image), contour_level) + contour_list = find_contours(np.array(image), contour_level) + contour = find_longest_array(contour_list) if method == "uniform_spline": return contours.uniform_spline_resample_contour(contour=contour, size=size) if method == "cubic_polar": From 6e14ffd8f858d884dfcf1a10eccaf5ca7e221938 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 14:21:23 +0000 Subject: [PATCH 061/204] change back to use dataset name from clarg + change default wandb jobname and latent space size --- scripts/shapes/shape_embed.py | 3 ++- 
slurm_shape_embed_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index ff3c8f27..d1ae640e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -186,7 +186,8 @@ def shape_embed_process(clargs): "frobenius_norm": False, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" - "dataset": "synthcellshapes_dataset", + #"dataset": "synthcellshapes_dataset", + "dataset": clargs.dataset[0], } optimizer_params = { diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 6adf0bad..1eb570b1 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -34,7 +34,7 @@ , "resnet18_vae_legacy" ] batch_sizes = [4] -latent_space_sizes = [512] +latent_space_sizes = [128] datasets = [ ("vampire", "vampire/torchvision/Control/") @@ -44,7 +44,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-ite-dataset' +wandb_project='shape-embed-test-changes' slurm_script="""#!/bin/bash From f31b9a0a816c29af2f1b68f2835090fca30d1dd7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 16:45:58 +0000 Subject: [PATCH 062/204] added back dataset subseting --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index d1ae640e..f4a922b5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -353,6 +353,7 @@ def shape_embed_process(clargs): print(f"Error occurred for image {idx}: {e}") # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) dataloader = DataModule( dataset, batch_size=args.batch_size, From d052800a74207da86840e06a2958b1e7d5c37b34 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 16:47:10 +0000 Subject: [PATCH 063/204] Added a tiny dataset for quick debugging (commented out in the slurm script) --- slurm_shape_embed_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 1eb570b1..38a964d9 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -37,6 +37,7 @@ latent_space_sizes = [128] datasets = [ +# ("tiny_synthcell", "tiny_synthcellshapes_dataset/") ("vampire", "vampire/torchvision/Control/") , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") , ("synthcell", "synthcellshapes_dataset/") From df41415f1dca7014021859678a3c12878560c4ab Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 4 Mar 2024 20:35:37 +0000 Subject: [PATCH 064/204] use specific gpu resource --- scripts/shapes/shape_embed.py | 2 +- slurm_shape_embed_dataset.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f4a922b5..bb2c65ff 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -183,7 +183,7 @@ def shape_embed_process(clargs): "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, - "frobenius_norm": False, + "frobenius_norm": True, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" #"dataset": "synthcellshapes_dataset", diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 38a964d9..95e7a90c 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -28,13 +28,17 @@ #sbatch "$job_script" models = [ - 
"resnet18_vae" + "resnet50_vae" +, "resnet50_vqvae" +, "resnet50_vqvae_legacy" +, "resnet50_vae_legacy" +, "resnet18_vae" , "resnet18_vqvae" , "resnet18_vqvae_legacy" -, "resnet18_vae_legacy" -] +, "resnet18_vae_legacy"] + batch_sizes = [4] -latent_space_sizes = [128] +latent_space_sizes = [512] datasets = [ # ("tiny_synthcell", "tiny_synthcellshapes_dataset/") @@ -45,7 +49,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-test-changes' +wandb_project='shape-embed-biggest' slurm_script="""#!/bin/bash @@ -101,6 +105,7 @@ def n_gpus(ls): , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' - , '--gres', n_gpus(ls) + #, '--gres', n_gpus(ls) + , '--gpus=a100:1' , fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) From f1e5a3cf95b308eb73c1396e9efda2c7b123c158 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:00 +0000 Subject: [PATCH 065/204] Adding roc_auc and using balanced accuracy --- scripts/shapes/shape_embed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 071127ed..36f130d3 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -113,10 +113,11 @@ def scoring_df(X, y): ) # Define a dictionary of metrics scoring = { - "accuracy": make_scorer(metrics.accuracy_score), + "accuracy": make_scorer(metrics.balanced_accuracy_score), "precision": make_scorer(metrics.precision_score, average="macro"), "recall": make_scorer(metrics.recall_score, average="macro"), "f1": make_scorer(metrics.f1_score, average="macro"), + "roc_auc": make_scorer(metrics.roc_auc_score, average="macro"), } # Create a random forest classifier From d31ca9600ca3b754ef91d541050857bbba84477f Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:31 +0000 Subject: [PATCH 066/204] Probably should stratify --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 36f130d3..e5a40242 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -4,7 +4,7 @@ from sklearn.decomposition import PCA from sklearn.discriminant_analysis import StandardScaler from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import cross_validate, KFold, train_test_split +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold from sklearn.metrics import make_scorer import pandas as pd from sklearn import metrics @@ -137,7 +137,7 @@ def scoring_df(X, y): estimator=pipeline, X=X, y=y, - cv=KFold(n_splits=k_folds), + cv=StratifiedKFold(n_splits=k_folds), scoring=scoring, n_jobs=-1, return_train_score=False, From abe2664aa11404801779be28e215a3cac9dcc72c Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:47 +0000 Subject: [PATCH 067/204] Adding coordinate debug (unchecked) --- scripts/shapes/shape_embed.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index e5a40242..49cd29d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -400,6 +400,23 @@ def shape_embed_process(): predictions = trainer.predict(lit_model, datamodule=dataloader) + + test_dist_pred = predictions[0].out.recon_x + plt.imsave(metadata(f"test_dist_pred.png"), test_dist_pred.mean(axis=(0,1))) + plt.close() + + 
test_dist_in = predictions[0].x.data + plt.imsave(metadata(f"test_dist_in.png"), test_dist_in.mean(axis=(0,1))) + plt.close() + + test_pred_coords = AsymmetricDistogramToCoordsPipeline(window_size=window_size)( + np.array(test_dist_pred[:, 0, :, :].unsqueeze(dim=0)) + ) + + plt.scatter(*test_pred_coords[0,0].T) + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"test_pred_coords.png"), bbox_inches="tight", pad_inches=0) + plt.close() # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) From ab48c09436dce7c9c1e1a2647f18d8d79da5639b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 6 Mar 2024 09:24:36 +0000 Subject: [PATCH 068/204] put back frobenius norm false --- scripts/shapes/shape_embed.py | 2 +- slurm_shape_embed_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index bb2c65ff..f4a922b5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -183,7 +183,7 @@ def shape_embed_process(clargs): "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, - "frobenius_norm": True, + "frobenius_norm": False, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" #"dataset": "synthcellshapes_dataset", diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 95e7a90c..0651c361 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -49,7 +49,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-biggest' +wandb_project='shape-embed-no-norm' slurm_script="""#!/bin/bash From 46eb3d1eb2a716ccd833bc6cdd6f5ea036466736 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 6 Mar 2024 12:08:00 +0000 Subject: [PATCH 069/204] Forgot an import --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49cd29d6..375b1fc7 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -44,6 +44,7 @@ MaskToDistogramPipeline, RotateIndexingClockwise, CoordsToDistogram, + AsymmetricDistogramToCoordsPipeline, ) import matplotlib.pyplot as plt From db88d341ee12f6cc3a78d0750a8506e620a29b0f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 7 Mar 2024 09:28:36 +0000 Subject: [PATCH 070/204] add the hardcode entity and add model dir --- scripts/shapes/shape_embed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index ae5d5ce0..961f5ad0 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -383,6 +383,8 @@ def shape_embed_process(clargs): dataloader.setup() model.eval() + model_dir = f"checkpoints/{hashing_fn(args)}" + if clargs.clear_checkpoints: print("cleaning checkpoints") shutil.rmtree("checkpoints/") @@ -390,7 +392,8 @@ def shape_embed_process(clargs): tb_logger = pl_loggers.TensorBoardLogger(f"logs/") jobname = f"{params['model']}_{interp_size}_{params['batch_size']}_{clargs.dataset[0]}" - wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) + wandblogger = pl_loggers.WandbLogger(entity='foix', project="shape_embed_fixes", name=jobname) + #wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) From 
2505a2dfcba17cd38f9e02a85c6a0ab1a564d449 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 11 Mar 2024 14:56:31 +0000 Subject: [PATCH 071/204] reduce epochs --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 4839daf7..eeb797d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -174,7 +174,7 @@ def shape_embed_process(clargs): params = { "model":clargs.model, #"model":"resnet18_vae", - "epochs": 250, + "epochs": 150, "batch_size": clargs.batch_size, #"batch_size": 4, "num_workers": 2**4, From f588a2d9a8cd983b3a031e4619bb286b5858bb18 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 27 Mar 2024 18:56:58 +0000 Subject: [PATCH 072/204] all changes --- bioimage_embed/shapes/mds.py | 5 +- scripts/shapes/distmatrix2embeding.py | 9 + scripts/shapes/masks2distmatrices.py | 249 ++++++++++++++++++++++++++ scripts/shapes/shape_embed.py | 2 +- 4 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 scripts/shapes/distmatrix2embeding.py create mode 100644 scripts/shapes/masks2distmatrices.py diff --git a/bioimage_embed/shapes/mds.py b/bioimage_embed/shapes/mds.py index fdcf2af1..19846375 100644 --- a/bioimage_embed/shapes/mds.py +++ b/bioimage_embed/shapes/mds.py @@ -7,11 +7,12 @@ def mds(d): :return: A matrix of x, y coordinates. """ n = d.size(0) - I = torch.eye(n) + I = torch.eye(n, dtype=torch.float64) H = I - torch.ones((n, n)) / n S = -0.5 * H @ d @ H - eigvals, eigvecs = S.symeig(eigenvectors=True) + #eigvals, eigvecs = S.symeig(eigenvectors=True) + eigvals, eigvecs = torch.linalg.eigh(S) # Sort the eigenvalues and eigenvectors in decreasing order idx = eigvals.argsort(descending=True) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py new file mode 100644 index 00000000..0f7fe82a --- /dev/null +++ b/scripts/shapes/distmatrix2embeding.py @@ -0,0 +1,9 @@ +# Loading the data (matrices) + +# TO DO: Apply transformation + +# Build the model + +# Train the model + +# Pull the embedings \ No newline at end of file diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py new file mode 100644 index 00000000..42051416 --- /dev/null +++ b/scripts/shapes/masks2distmatrices.py @@ -0,0 +1,249 @@ +# Imports when necessary +import numpy as np +import torch +import logging +import sklearn +import skimage as sk +import scipy.spatial +from scipy.interpolate import splprep, splev +import matplotlib.image +import matplotlib.pyplot as plt +import glob + +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.manifold import MDS + +from bioimage_embed import shapes +import bioimage_embed +from pytorch_lightning import loggers as pl_loggers +from torchvision import transforms +from bioimage_embed.lightning import DataModule + +from torchvision import datasets + +from bioimage_embed.shapes.mds import mds + +from bioimage_embed.shapes.transforms import ( + CropCentroidPipeline, + CoordsToDistogram, + ImageToCoords, + RotateIndexingClockwise, +) + +logger = logging.getLogger(__name__) + +# Where is the datat I want to transform +dataset = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" + +########################################################################## +####### Simplified version in order to make the things properly work ##### +########################################################################## + +def find_contour(mask): + contour = 
sk.measure.find_contours(mask, 0.8)[0] + x, y = contour[:, 0], contour[:, 1] + return x, y + +def spline_interpolation(x, y): + sparsity_contour = 4 # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother + tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) + sample_points = 200 + # How many times to sample the spline + new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. + # Evaluate the spline + x_spline, y_spline = splev(new_u, tck) + return x_spline, y_spline + +def build_distance_matrix(x_reinterpolated, y_reinterpolated): + reinterpolated_contour = np.column_stack([x_reinterpolated, y_reinterpolated]) + dm = scipy.spatial.distance_matrix(reinterpolated_contour, reinterpolated_contour) + return dm + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed') + return embedding.fit_transform(dst_mat) + + +# Simplified version for test +def process_png_file(mask_path): + # Perform specific action for each PNG file + print("Processing:", mask_path) + mask = plt.imread(mask_path) + + # Get the contour + x, y = find_contour(mask) + + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + plt.scatter(x_reinterpolated, y_reinterpolated, s=6) + plt.savefig(f'results/reconstruction/original_contour{i}.png') + plt.clf() + + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + # print("Distance matrix") + # print(dm) + + # Reconstruction coordinates and matrix (MDS) + reconstructed_coords = dist_to_coords(dm) + print(reconstructed_coords) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(f'results/reconstruction/reconstructed_contour{i}.png') + plt.clf() + reconstructed_matrix = euclidean_distances(reconstructed_coords) + + # Error with matrix + err = np.average(dm - reconstructed_matrix) + print(f"Dist error is: {err}") + +# Specify the folder path containing PNG files +folder_path = "/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/*/*.png" + +# Use glob to find all PNG files in the folder +png_files = glob.glob(folder_path) + +# Iterate through all PNG files found +for i, file_path in enumerate(png_files): + # Process the PNG file + process_png_file(file_path) + + + + +######################################## +############# Other code ############### +######################################## + +# # Needed variables +# window_size = 256 # needs to be the same as the latent space size +# interp_size = 256 # latent space size needs to match the window size + +# # This crops the image using the centroid by window sizes. 
(remember to removed and see what happens) +# transform_crop = CropCentroidPipeline(window_size) + +# # From the coordinates of the distance matrix, this is actually building the distance matrix +# transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) + +# # It takes the images and converts it into a numpy array of the image and the size +# transform_coords = ImageToCoords(window_size) + +# # Combination of transforms +# transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + +# transform_mask_to_crop = transforms.Compose( +# [ +# # transforms.ToTensor(), +# transform_mask_to_gray, +# transform_crop, +# ] +# ) + +# transform_mask_to_coords = transforms.Compose( +# [ +# transform_mask_to_crop, +# transform_coords, +# ] +# ) + +# transform_mask_to_dist = transforms.Compose( +# [ +# transform_mask_to_coords, +# transform_coord_to_dist, +# ] +# ) + +# def dist_to_coords(dst_mat): +# embedding = MDS(n_components=2, dissimilarity='precomputed', max_iter=1) +# return embedding.fit_transform(dst_mat) + + #coords_prime = MDS( + #n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dst_mat) + + #return coords_prime + #return mds(dst_mat) + + # from https://math.stackexchange.com/a/423898 and https://stackoverflow.com/a/17177833/16632916 +# m = np.zeros(shape=dst_mat.shape) +# for i in range(dst_mat.shape[0]): +# for j in range(dst_mat.shape[1]): +# m[i,j]= 0.5*(dst_mat[0, j]**2 + dst_mat[i, 0]**2 - dst_mat[i, j]**2) +# eigenvalues, eigenvectors = np.linalg.eig(m) +# print(f'm:{m}') +# print(f'eigenvalues:{eigenvalues}') +# print(f'eigenvectors:{eigenvectors}') +# return np.sqrt(eigenvalues)*eigenvectors + +# # Convert your image to gray scale +# gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + +# # choose the transformation you want to apply to your data and Compose +# transform = transforms.Compose( +# [ +# transform_mask_to_dist, +# transforms.ToTensor(), +# RotateIndexingClockwise(p=1), # This module effectively allows for random clockwise rotations of input images with a specified probability. 
+# gray2rgb, +# ] +# ) + +# transforms_dict = { +# "none": transform_mask_to_gray, +# "transform_crop": transform_mask_to_crop, +# "transform_dist": transform_mask_to_dist, +# "transform_coords": transform_mask_to_coords, +# } + + + +# diagonal = np.diag(dm) + +# if np.all(diagonal == 0): +# print("All elements in the diagonal are zeros.") +# dataset_raw[i][0].save(f'original_{i}.png') +# np.save(f"random_matrix_{i}.npy", dataset_trans[i][0][0]) +# matplotlib.image.imsave(f'dist_mat_{i}.png', dataset_trans[i][0][0]) +# coords = dist_to_coords(dataset_trans[i][0][0]) +# print(coords) +# x, y = list(zip(*coords)) +# plt.scatter(x_reinterpolated, y_reinterpolated) +# plt.savefig(f'mask_{i}.png') +# plt.clf() +# fig, ax = plt.subplots(1, 4, figsize=(20, 5)) +# ax[0].imshow(mask) +# ax[1].scatter(x_reinterpolated, y_reinterpolated) +# ax[1].imshow(dm) +# ax[3].scatter(x, y) +# fig.savefig(f'combined_{i}.png') +# else: +# print("Not all elements in the diagonal are zeros.") + + + +# # Apply transform to find which images don't work +# dataset_raw = datasets.ImageFolder(dataset) +# dataset_contours = datasets.ImageFolder(dataset, transform=transform_mask_to_coords) +# dataset_trans = datasets.ImageFolder(dataset, transform=transform) + +# # This is a single image distance matrix +# for i in range(0, 10): +# print(dataset_trans[i][0][0]) +# diagonal = np.diag(dataset_trans[i][0][0]) +# if np.all(diagonal == 0): +# print("All elements in the diagonal are zeros.") +# dataset_raw[i][0].save(f'original_{i}.png') +# np.save(f"random_matrix_{i}.npy", dataset_trans[i][0][0]) +# matplotlib.image.imsave(f'dist_mat_{i}.png', dataset_trans[i][0][0]) +# coords = dist_to_coords(dataset_trans[i][0][0]) +# print(coords) +# x, y = list(zip(*coords)) +# plt.scatter(x, y) +# plt.savefig(f'mask_{i}.png') +# plt.clf() +# fig, ax = plt.subplots(1, 4, figsize=(20, 5)) +# ax[0].imshow(dataset_raw[i][0]) +# ax[1].imshow(dataset_trans[i][0][0]) +# ax[2].scatter(dataset_contours[i][0][0], dataset_contours[i][0][1]) +# ax[3].scatter(x, y) +# fig.savefig(f'combined_{i}.png') +# else: +# print("Not all elements in the diagonal are zeros.") diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index eeb797d6..2a82c708 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -224,7 +224,7 @@ def shape_embed_process(clargs): # window_size, interp_size, matrix_normalised=False # ) transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) - transform_mdscoords = DistogramToCoords(window_size) + #transform_mdscoords = DistogramToCoords(window_size) transform_coords = ImageToCoords(window_size) transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) From 6151a6f3a08f3dca3dd29127dad86344064d2a0a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 27 Mar 2024 22:24:07 +0000 Subject: [PATCH 073/204] first structure --- scripts/shapes/distmatrix2embeding.py | 184 +++++++++++++++++++++++++- scripts/shapes/masks2distmatrices.py | 88 +++++++----- 2 files changed, 237 insertions(+), 35 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 0f7fe82a..1dbeb7c3 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,9 +1,183 @@ -# Loading the data (matrices) +from torchvision import datasets, transforms +import pytorch_lightning as pl +import bioimage_embed +import bioimage_embed.shapes +import bioimage_embed.lightning +import argparse +import types -# TO DO: 
Apply transformation +# misc helpers +############################################################################### -# Build the model +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) -# Train the model +# Main process +############################################################################### -# Pull the embedings \ No newline at end of file +def main_process(params): + + # Loading the data (matrices) + ########################################################################### + + preproc_transform = transforms.Compose([ + transforms.ToTensor(), + ]) + dataset = datasets.ImageFolder(params.dataset[1], transform = preproc_transform) + dataloader = bioimage_embed.lightning.DataModule( + dataset, + batch_size=params.batch_size, + shuffle=True, + num_workers=params.num_workers, + ) + dataloader.setup() + vprint(1, f'dataloader ready') + + # Build the model + ########################################################################### + + model = bioimage_embed.models.create_model( + model=params.model, + input_dim=params.input_dim, + latent_dim=params.latent_dim, + pretrained=params.pretrained, + ) + lit_model = bioimage_embed.shapes.MaskEmbed(model, params) + vprint(1, f'model ready') + + # Train the model + ########################################################################### + + trainer = pl.Trainer( + #TODO logger=[wandblogger, tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + #TODO accelerator="gpu", + accumulate_grad_batches=4, + #TODO callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=params.epochs, + log_every_n_steps=1, + ) + trainer.fit(lit_model, datamodule=dataloader) + lit_model.eval() + vprint(1, f'trainer fitted') + + # Pull the embedings + ########################################################################### + vprint(1, f'TODO') + +# default parameters +############################################################################### + +params = types.SimpleNamespace(**{ + # general params + "model":"resnet18_vae", + "epochs": 150, + "batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, 512, 512), + "latent_dim": 512, + "num_embeddings": 512, + "num_hiddens": 512, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + "dataset": "bbbc010/BBBC010_v1_foreground_eachworm", + # optimizer_params + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + # lr_scheduler_params + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, +}) + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset") + parser.add_argument( + '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' + , help=f"The wandb PROJECT name") + parser.add_argument( + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument( + '-n', '--num-workers', default=int(2**4), metavar='NUM_WORKERS', type=auto_pos_int + , help="The NUM_WORKERS for the run, a positive integer (default 2**4)") + parser.add_argument( + '-e', '--num-epochs', default=int(150), metavar='NUM_EPOCHS', type=auto_pos_int + , help="The NUM_EPOCHS for the run, a positive integer (default 150)") + #parser.add_argument('--clear-checkpoints', action='store_true' + # , help='remove checkpoints') + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.model = clargs.model + params.dataset = clargs.dataset + params.wandb_project = clargs.wandb_project + params.batch_size = clargs.batch_size + interp_size = clargs.latent_space_size * 2 + params.input_dim = (3, interp_size, interp_size) + params.latent_dim = interp_size + params.num_embeddings = interp_size + params.num_hiddens = interp_size + params.num_workers = clargs.num_workers + params.epochs = clargs.num_epochs + + # run main process + main_process(params) \ No newline at end of file diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 42051416..7d31bb0c 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -1,27 +1,16 @@ # Imports when necessary import numpy as np -import torch -import logging -import sklearn import skimage as sk import scipy.spatial from scipy.interpolate import splprep, splev -import matplotlib.image import matplotlib.pyplot as plt import glob from sklearn.metrics.pairwise import euclidean_distances from sklearn.manifold import MDS -from bioimage_embed import shapes -import bioimage_embed -from pytorch_lightning import loggers as pl_loggers -from torchvision import transforms -from bioimage_embed.lightning import DataModule -from torchvision import datasets - -from bioimage_embed.shapes.mds import mds +from torchvision import datasets, transforms from bioimage_embed.shapes.transforms import ( CropCentroidPipeline, @@ -30,24 +19,39 @@ 
RotateIndexingClockwise, ) -logger = logging.getLogger(__name__) - # Where is the datat I want to transform -dataset = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" +#folder_path = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" +folder_path = f"/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset/" ########################################################################## ####### Simplified version in order to make the things properly work ##### ########################################################################## +def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): + """Turn an rgb array into a greyscale array using the following reduction: + grey = cr * r + cg * g + cb * b + + :param rgb: The rgb array + :param cr: The red coefficient + :param cg: The green coefficient + :param cb: The blue coefficient + + :returns: The greyscale array. + """ + r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] + return cr * r + cg * g + cb * b + def find_contour(mask): + if len(mask.shape) == 3: # (lines, columns, number of channels) + mask = rgb2grey(mask) contour = sk.measure.find_contours(mask, 0.8)[0] x, y = contour[:, 0], contour[:, 1] return x, y -def spline_interpolation(x, y): - sparsity_contour = 4 # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother +def spline_interpolation(x, y, sparsity_contour = 4, sample_points = 200): + # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother + sparsity_contour = max(1, sparsity_contour) tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) - sample_points = 200 # How many times to sample the spline new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. 
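    # splprep above fits a parametric spline through the (optionally
    # subsampled) raw contour points and new_u picks sample_points evenly
    # spaced parameter values, so every mask is resampled to the same number
    # of contour points before its distance matrix is built. A minimal sketch
    # of the idea, assuming a closed circular contour (purely illustrative,
    # not part of this script):
    #   t = np.linspace(0, 2 * np.pi, 100, endpoint=False)
    #   xs, ys = spline_interpolation(np.cos(t), np.sin(t), 4, 200)
    #   assert len(xs) == len(ys) == 200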
# Evaluate the spline @@ -63,9 +67,28 @@ def dist_to_coords(dst_mat): embedding = MDS(n_components=2, dissimilarity='precomputed') return embedding.fit_transform(dst_mat) +def mask2distmatrix(mask): + # extract mask contour + x, y = find_contour(mask) + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + return dm + +def masks2distmatrices(mask_dataset_path=folder_path, output_path=None): + print('loading base dataset') + dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ + np.array, + mask2distmatrix + ])) + for idx, data in enumerate(dataset): + print(f'idx: {idx}') + print(f'data: {data}') + #torch.save(data, 'data_drive_path{}'.format(idx)) # Simplified version for test -def process_png_file(mask_path): +def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): # Perform specific action for each PNG file print("Processing:", mask_path) mask = plt.imread(mask_path) @@ -76,11 +99,12 @@ def process_png_file(mask_path): # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) plt.scatter(x_reinterpolated, y_reinterpolated, s=6) - plt.savefig(f'results/reconstruction/original_contour{i}.png') + plt.savefig(f'{output_folder}/original_contour{idx}.png') plt.clf() # Build the distance matrix dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + np.save(f"{output_folder}/matrix_{idx}.npy", dm) # print("Distance matrix") # print(dm) @@ -88,28 +112,32 @@ def process_png_file(mask_path): reconstructed_coords = dist_to_coords(dm) print(reconstructed_coords) plt.scatter(*zip(*reconstructed_coords), s=6) - plt.savefig(f'results/reconstruction/reconstructed_contour{i}.png') + plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') plt.clf() reconstructed_matrix = euclidean_distances(reconstructed_coords) # Error with matrix err = np.average(dm - reconstructed_matrix) print(f"Dist error is: {err}") +############################################################################### -# Specify the folder path containing PNG files -folder_path = "/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/*/*.png" - -# Use glob to find all PNG files in the folder -png_files = glob.glob(folder_path) +if __name__ == "__main__": -# Iterate through all PNG files found -for i, file_path in enumerate(png_files): - # Process the PNG file - process_png_file(file_path) + ## Use glob to find all PNG files in the folder + #png_files = glob.glob(folder_path+"*/*.png") + # + ## Iterate through all PNG files found + #for i, file_path in enumerate(png_files): + # # Process the PNG file + # process_png_file(file_path, i) + masks2distmatrices() +############################################################################### +############################################################################### +############################################################################### ######################################## ############# Other code ############### ######################################## From 8208238dcc6b078a29b2cf91d23d774c007e0ed4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:13:05 +0100 Subject: [PATCH 074/204] Properly overwrite default params from clargs --- scripts/shapes/distmatrix2embeding.py | 88 +++++++++++++++------------ 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py 
b/scripts/shapes/distmatrix2embeding.py index 1dbeb7c3..9c7845df 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -77,9 +77,25 @@ def main_process(params): # default parameters ############################################################################### +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] + params = types.SimpleNamespace(**{ # general params - "model":"resnet18_vae", + "model": "resnet18_vae", "epochs": 150, "batch_size": 4, "num_workers": 2**4, @@ -91,7 +107,7 @@ def main_process(params): "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, - "dataset": "bbbc010/BBBC010_v1_foreground_eachworm", + "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset_distmat"), # optimizer_params "opt": "AdamW", "lr": 0.001, @@ -119,42 +135,27 @@ def auto_pos_int (x): parser = argparse.ArgumentParser(description='Run the shape embed pipeline') - models = [ - "resnet18_vae" - , "resnet50_vae" - , "resnet18_vae_bolt" - , "resnet50_vae_bolt" - , "resnet18_vqvae" - , "resnet50_vqvae" - , "resnet18_vqvae_legacy" - , "resnet50_vqvae_legacy" - , "resnet101_vqvae_legacy" - , "resnet110_vqvae_legacy" - , "resnet152_vqvae_legacy" - , "resnet18_vae_legacy" - , "resnet50_vae_legacy" - ] parser.add_argument( - '-m', '--model', choices=models, default=models[0], metavar='MODEL' - , help=f"The MODEL to use, one of {models} (default {models[0]}).") + '-m', '--model', choices=models, metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {params.model}).") parser.add_argument( - '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') - , help=f"The NAME of and PATH to the dataset") + '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") parser.add_argument( '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' , help=f"The wandb PROJECT name") parser.add_argument( - '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int - , help="The BATCH_SIZE for the run, a positive integer (default 4)") + '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int + , help=f"The BATCH_SIZE for the run, a positive integer (default {params.batch_size})") parser.add_argument( - '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int - , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + '-l', '--latent-space-size', metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help=f"The LATENT_SPACE_SIZE, a positive integer (default {params.latent_dim})") parser.add_argument( - '-n', '--num-workers', default=int(2**4), metavar='NUM_WORKERS', type=auto_pos_int - , help="The NUM_WORKERS for the run, a positive integer (default 2**4)") + '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int + , help=f"The NUM_WORKERS for the run, a positive integer (default {params.num_workers})") parser.add_argument( - '-e', '--num-epochs', default=int(150), metavar='NUM_EPOCHS', type=auto_pos_int - , help="The NUM_EPOCHS for the run, a positive integer (default 150)") + '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int + , help=f"The NUM_EPOCHS 
for the run, a positive integer (default {params.epochs})") #parser.add_argument('--clear-checkpoints', action='store_true' # , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 @@ -167,17 +168,24 @@ def auto_pos_int (x): vprint.lvl = clargs.verbose # update default params with clargs - params.model = clargs.model - params.dataset = clargs.dataset - params.wandb_project = clargs.wandb_project - params.batch_size = clargs.batch_size - interp_size = clargs.latent_space_size * 2 - params.input_dim = (3, interp_size, interp_size) - params.latent_dim = interp_size - params.num_embeddings = interp_size - params.num_hiddens = interp_size - params.num_workers = clargs.num_workers - params.epochs = clargs.num_epochs + if clargs.model: + params.model = clargs.model + if clargs.dataset: + params.dataset = clargs.dataset + if clargs.wandb_project: + params.wandb_project = clargs.wandb_project + if clargs.batch_size: + params.batch_size = clargs.batch_size + if clargs.latent_space_size: + interp_size = clargs.latent_space_size * 2 + params.input_dim = (params.input_dim[0], interp_size, interp_size) + params.latent_dim = interp_size + params.num_embeddings = interp_size + params.num_hiddens = interp_size + if clargs.num_workers: + params.num_workers = clargs.num_workers + if clargs.num_epochs: + params.epochs = clargs.num_epochs # run main process main_process(params) \ No newline at end of file From aacac25deaf527653f2f7406d5eb8665fa39e9f3 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:16:43 +0100 Subject: [PATCH 075/204] Use DatasetFolder to load .npy and turn the dist matrix into a 3 channels copy for models to be happy --- scripts/shapes/distmatrix2embeding.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9c7845df..a1026b67 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,9 +1,11 @@ from torchvision import datasets, transforms import pytorch_lightning as pl +import numpy as np import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning import argparse +import torch import types # misc helpers @@ -27,9 +29,10 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - transforms.ToTensor(), + torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor + lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) - dataset = datasets.ImageFolder(params.dataset[1], transform = preproc_transform) + dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( dataset, batch_size=params.batch_size, From 579ab0bf7eb94ee85891d16db7070e13a4daea36 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:19:07 +0100 Subject: [PATCH 076/204] Disable checkpoints in training by default (maybe re-enable at some future point) --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index a1026b67..c1168798 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -60,7 +60,7 @@ def main_process(params): trainer = pl.Trainer( #TODO logger=[wandblogger, 
tb_logger], gradient_clip_val=0.5, - enable_checkpointing=True, + enable_checkpointing=False, devices=1, #TODO accelerator="gpu", accumulate_grad_batches=4, From f0fee887c6a4bb93e341c673f4e27022c1585954 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:19:56 +0100 Subject: [PATCH 077/204] Enable gpu accelleration by default --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c1168798..144b5102 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -62,7 +62,7 @@ def main_process(params): gradient_clip_val=0.5, enable_checkpointing=False, devices=1, - #TODO accelerator="gpu", + accelerator="gpu", accumulate_grad_batches=4, #TODO callbacks=[checkpoint_callback], min_epochs=50, From 5e97713d757c18b1869ec3eaea1beae6a32dce40 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:20:24 +0100 Subject: [PATCH 078/204] more informative verbose print --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 144b5102..79166af5 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -75,7 +75,7 @@ def main_process(params): # Pull the embedings ########################################################################### - vprint(1, f'TODO') + vprint(1, f'TODO: pull the embedings') # default parameters ############################################################################### From bb425dfcce248d138089f0ced7614dd00be17190 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 13:41:54 +0100 Subject: [PATCH 079/204] bring argparse to the masks2distmatrices script --- scripts/shapes/masks2distmatrices.py | 236 +++++++++++++++++---------- 1 file changed, 151 insertions(+), 85 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 7d31bb0c..023617ae 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -1,31 +1,25 @@ -# Imports when necessary import numpy as np +import imageio.v3 as iio import skimage as sk -import scipy.spatial from scipy.interpolate import splprep, splev -import matplotlib.pyplot as plt +import scipy.spatial +import argparse +import pathlib +import types import glob +import os -from sklearn.metrics.pairwise import euclidean_distances -from sklearn.manifold import MDS - - -from torchvision import datasets, transforms - -from bioimage_embed.shapes.transforms import ( - CropCentroidPipeline, - CoordsToDistogram, - ImageToCoords, - RotateIndexingClockwise, -) - -# Where is the datat I want to transform -#folder_path = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" -folder_path = f"/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset/" +# misc helpers +############################################################################### -########################################################################## -####### Simplified version in order to make the things properly work ##### -########################################################################## +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) def 
rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): """Turn an rgb array into a greyscale array using the following reduction: @@ -41,6 +35,10 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] return cr * r + cg * g + cb * b +########################################################################## +####### Simplified version in order to make the things properly work ##### +########################################################################## + def find_contour(mask): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) @@ -48,12 +46,12 @@ def find_contour(mask): x, y = contour[:, 0], contour[:, 1] return x, y -def spline_interpolation(x, y, sparsity_contour = 4, sample_points = 200): +def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother - sparsity_contour = max(1, sparsity_contour) - tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) + raw_sampling_sparsity = max(1, raw_sampling_sparsity) + tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0) # How many times to sample the spline - new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. + new_u = np.linspace(u.min(), u.max(), spline_sampling) # Last parameter is how dense is our spline, how many points. # Evaluate the spline x_spline, y_spline = splev(new_u, tck) return x_spline, y_spline @@ -66,72 +64,140 @@ def build_distance_matrix(x_reinterpolated, y_reinterpolated): def dist_to_coords(dst_mat): embedding = MDS(n_components=2, dissimilarity='precomputed') return embedding.fit_transform(dst_mat) - -def mask2distmatrix(mask): + +def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): + vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour x, y = find_contour(mask) # Reinterpolate (spline) - x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) # Build the distance matrix dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + vprint(3, f'created distance matrix shape {dm.shape}') return dm -def masks2distmatrices(mask_dataset_path=folder_path, output_path=None): - print('loading base dataset') - dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ - np.array, - mask2distmatrix - ])) - for idx, data in enumerate(dataset): - print(f'idx: {idx}') - print(f'data: {data}') - #torch.save(data, 'data_drive_path{}'.format(idx)) - -# Simplified version for test -def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): - # Perform specific action for each PNG file - print("Processing:", mask_path) - mask = plt.imread(mask_path) - - # Get the contour - x, y = find_contour(mask) - - # Reinterpolate (spline) - x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) - plt.scatter(x_reinterpolated, y_reinterpolated, s=6) - plt.savefig(f'{output_folder}/original_contour{idx}.png') - plt.clf() - - # Build the distance matrix - dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) - np.save(f"{output_folder}/matrix_{idx}.npy", dm) - # print("Distance matrix") - # print(dm) - - # Reconstruction coordinates and matrix (MDS) - reconstructed_coords = 
dist_to_coords(dm) - print(reconstructed_coords) - plt.scatter(*zip(*reconstructed_coords), s=6) - plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') - plt.clf() - reconstructed_matrix = euclidean_distances(reconstructed_coords) - - # Error with matrix - err = np.average(dm - reconstructed_matrix) - print(f"Dist error is: {err}") +def masks2distmatrices(params): + + vprint(1, 'loading base dataset') + + if not params.mask_dataset_path: + sys.exit("no mask dataset provided") + if not params.output_path: + p = pathlib.Path(params.mask_dataset_path) + params.output_path=p.joinpath(p.parent, p.name+'_distmat') + + vprint(2, f'>>>> params.mask_dataset_path: {params.mask_dataset_path}') + vprint(2, f'>>>> params.mask_dataset_path: {next(os.walk(params.mask_dataset_path))[1]}') + vprint(2, f'>>>> params.output_path: {params.output_path}') + pathlib.Path(params.output_path).mkdir(parents=True, exist_ok=True) + class_folders = next(os.walk(params.mask_dataset_path))[1] + vprint(2, f'>>>> class_folders: {class_folders}') + for class_folder in class_folders: + vprint(2, f'>>>> class_folder: {class_folder}') + output_class_folder=os.path.join(params.output_path, class_folder) + vprint(2, f'creating output class folder: {output_class_folder}') + pathlib.Path(output_class_folder).mkdir(parents=True, exist_ok=True) + for mask_png in glob.glob(params.mask_dataset_path+'/'+class_folder+'/'+'*.png'): + vprint(3, f'{"-"*80}') + vprint(3, f'working on {mask_png}') + filename = os.path.basename(mask_png).split('.')[0] + vprint(3, f'filename {filename}') + mask = iio.imread(mask_png) + dm = mask2distmatrix(mask, params.raw_sampling_sparsity, params.spline_sampling) + output_file_name=f"{output_class_folder}/{filename}.npy" + vprint(3, f'saving {output_file_name}') + vprint(3, f'{"-"*80}') + np.save(output_file_name, dm) + + + #print('loading base dataset') + #dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ + # np.array, + # mask2distmatrix + #])) + #for idx, data in enumerate(dataset): + # print(f'idx: {idx}') + # print(f'data: {data}') + # #torch.save(data, 'data_drive_path{}'.format(idx)) + #print(dataset) + +# # Simplified version for test +# def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): +# # Perform specific action for each PNG file +# print("Processing:", mask_path) +# mask = plt.imread(mask_path) + +# # Get the contour +# x, y = find_contour(mask) + +# # Reinterpolate (spline) +# x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) +# plt.scatter(x_reinterpolated, y_reinterpolated, s=6) +# plt.savefig(f'{output_folder}/original_contour{idx}.png') +# plt.clf() + +# # Build the distance matrix +# dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) +# np.save(f"{output_folder}/matrix_{idx}.npy", dm) + +# # Reconstruction coordinates and matrix (MDS) +# reconstructed_coords = dist_to_coords(dm) +# print(reconstructed_coords) +# plt.scatter(*zip(*reconstructed_coords), s=6) +# plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') +# plt.clf() +# reconstructed_matrix = euclidean_distances(reconstructed_coords) + +# # Error with matrix +# err = np.average(dm - reconstructed_matrix) +# print(f"Dist error is: {err}") + ############################################################################### +params = types.SimpleNamespace(**{ + "mask_dataset_path": None + , "output_path": None + , "raw_sampling_sparsity": 1 + , "spline_sampling": 512 +}) + if __name__ == "__main__": - ## Use glob to find all PNG 
files in the folder - #png_files = glob.glob(folder_path+"*/*.png") - # - ## Iterate through all PNG files found - #for i, file_path in enumerate(png_files): - # # Process the PNG file - # process_png_file(file_path, i) + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn mask dataset into distance matrix dataset') + + parser.add_argument('path', metavar='PATH', help=f"The PATH to the dataset") + parser.add_argument('-o', '--output-path', help="The desired output path to the generated dataset") + parser.add_argument('-s', '--raw-sampling-sparsity', type=auto_pos_int + , help=f"The desired sparsity (in number of points) when sampling the raw contour (default, every {params.raw_sampling_sparsity} point(s))") + parser.add_argument('-n', '--spline-sampling', type=auto_pos_int + , help=f"The desired number of points when sampling the spline contour (default, {params.spline_sampling} point(s))") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") - masks2distmatrices() + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + if clargs.path: + params.mask_dataset_path = clargs.path + #params.mask_dataset_path = "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset" + if clargs.output_path: + params.output_path = clargs.output_path + if clargs.raw_sampling_sparsity: + params.raw_sampling_sparsity = clargs.raw_sampling_sparsity + if clargs.spline_sampling: + params.spline_sampling = clargs.spline_sampling + + masks2distmatrices(params) @@ -142,12 +208,12 @@ def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): ############# Other code ############### ######################################## -# # Needed variables +# # Needed variables # window_size = 256 # needs to be the same as the latent space size # interp_size = 256 # latent space size needs to match the window size # # This crops the image using the centroid by window sizes. 
(remember to removed and see what happens) -# transform_crop = CropCentroidPipeline(window_size) +# transform_crop = CropCentroidPipeline(window_size) # # From the coordinates of the distance matrix, this is actually building the distance matrix # transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) @@ -183,13 +249,13 @@ def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): # def dist_to_coords(dst_mat): # embedding = MDS(n_components=2, dissimilarity='precomputed', max_iter=1) # return embedding.fit_transform(dst_mat) - + #coords_prime = MDS( #n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dst_mat) - + #return coords_prime #return mds(dst_mat) - + # from https://math.stackexchange.com/a/423898 and https://stackoverflow.com/a/17177833/16632916 # m = np.zeros(shape=dst_mat.shape) # for i in range(dst_mat.shape[0]): From ac3b85a79875b59073f91731c177837c98d6d32c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 11:35:06 +0100 Subject: [PATCH 080/204] training and test model --- scripts/shapes/distmatrix2embeding.py | 33 +++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 79166af5..277e7269 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -4,6 +4,7 @@ import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning +from bioimage_embed.lightning import DataModule import argparse import torch import types @@ -72,10 +73,38 @@ def main_process(params): trainer.fit(lit_model, datamodule=dataloader) lit_model.eval() vprint(1, f'trainer fitted') - - # Pull the embedings + + #TODO: Validate the model + ########################################################################### + vprint(1, f'TODO: Validate the model') + validation = trainer.validate(lit_model, datamodule=dataloader) + + #TODO: Test the model + ########################################################################### + vprint(1, f'TODO: Test the model') + testing = trainer.test(lit_model, datamodule=dataloader) + + # Inference on full dataset + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=params.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + #TODO: Pull the embedings ########################################################################### vprint(1, f'TODO: pull the embedings') + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + # Save the latent space + np.save('latent_space.npy', latent_space) # default parameters ############################################################################### From 7e7c7a28cd6d99d21cd6d00c722193028a8c4653 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 21:43:04 +0100 Subject: [PATCH 081/204] Roll indices + normalisation + sanity_check + dataset name for latent space --- scripts/shapes/distmatrix2embeding.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 277e7269..ad26555d 100644 --- a/scripts/shapes/distmatrix2embeding.py 
+++ b/scripts/shapes/distmatrix2embeding.py @@ -6,6 +6,7 @@ import bioimage_embed.lightning from bioimage_embed.lightning import DataModule import argparse +import datetime import torch import types @@ -21,6 +22,21 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): vprint.lvl = 0 vprint(tgtlvl, msg) +def maybe_roll (dist_mat, p = 0.5): + if np.random.rand() < p: + return np.roll(dist_mat, np.random.randint(0, dist_mat.shape[0]), (0,1)) + else: + return dist_mat + +def sanity_check (dist_mat): + if not np.allclose(dist_mat, dist_mat.T): + raise ValueError("Matrix is not symmetric") + if np.any(dist_mat < 0): + raise ValueError("Matrix has negative values") + if np.any(np.diag(dist_mat)): + raise ValueError("Matrix has non-zero diagonal") + return dist_mat + # Main process ############################################################################### @@ -30,10 +46,14 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ + lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix + sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) + #dataset = datasets.DatasetFolder(params.dataset[1], loader=lambda x: np.load(x, allow_pickle=True), extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( dataset, batch_size=params.batch_size, @@ -104,7 +124,7 @@ def main_process(params): # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) # Save the latent space - np.save('latent_space.npy', latent_space) + np.save(f'{params.dataset[0]}_{str(datetime.datetime.now()).replace(" ", "_")}.npy', latent_space) # default parameters ############################################################################### From da4acf89c30cba05dc54e29373ac02adcd7e4b9e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 22:00:25 +0100 Subject: [PATCH 082/204] Added wandb logging --- scripts/shapes/distmatrix2embeding.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ad26555d..c7adae0f 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -5,6 +5,7 @@ import bioimage_embed.shapes import bioimage_embed.lightning from bioimage_embed.lightning import DataModule +from pytorch_lightning import loggers as pl_loggers import argparse import datetime import torch @@ -75,11 +76,17 @@ def main_process(params): lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') + # WandB logger + ########################################################################### + jobname = f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger.watch(lit_model, log="all") + # Train the model ########################################################################### trainer = pl.Trainer( - #TODO logger=[wandblogger, tb_logger], + 
logger=[wandblogger], gradient_clip_val=0.5, enable_checkpointing=False, devices=1, @@ -194,8 +201,11 @@ def auto_pos_int (x): '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") parser.add_argument( - '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' - , help=f"The wandb PROJECT name") + '--wandb-entity', default="foix", metavar='WANDB_ENTITY' + , help=f"The WANDB_ENTITY name") + parser.add_argument( + '--wandb-project', default="simply-shape", metavar='WANDB_PROJECT' + , help=f"The WANDB_PROJECT name") parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {params.batch_size})") @@ -224,6 +234,8 @@ def auto_pos_int (x): params.model = clargs.model if clargs.dataset: params.dataset = clargs.dataset + if clargs.wandb_entity: + params.wandb_entity = clargs.wandb_entity if clargs.wandb_project: params.wandb_project = clargs.wandb_project if clargs.batch_size: From fd7d1225e2f27c42e2657909d736badcaef3bccf Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 3 Apr 2024 00:37:34 +0100 Subject: [PATCH 083/204] Added the extraction of original/reconstructed matrices + clarg for output dir --- scripts/shapes/distmatrix2embeding.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c7adae0f..9f3616e5 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -8,6 +8,7 @@ from pytorch_lightning import loggers as pl_loggers import argparse import datetime +import pathlib import torch import types @@ -125,13 +126,22 @@ def main_process(params): predictions = trainer.predict(lit_model, datamodule=dataloader) - #TODO: Pull the embedings + #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### - vprint(1, f'TODO: pull the embedings') + vprint(1, f'pull the embedings') # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + # create the output directory + output_dir = params.output_dir + if output_dir is None: + output_dir = f'./{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) # Save the latent space - np.save(f'{params.dataset[0]}_{str(datetime.datetime.now()).replace(" ", "_")}.npy', latent_space) + np.save(f'{output_dir}/latent_space.npy', latent_space) + # Save the reconstructions + for i, pred in enumerate(predictions): + np.save(f'{output_dir}/original_{i}.npy', pred.x.data[0,0]) + np.save(f'{output_dir}/reconstruction_{i}.npy', pred.out.recon_x[0,0]) # default parameters ############################################################################### @@ -200,6 +210,9 @@ def auto_pos_int (x): parser.add_argument( '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=None + , help=f"The OUTPUT_DIR path to use to dump results") parser.add_argument( '--wandb-entity', default="foix", metavar='WANDB_ENTITY' , help=f"The WANDB_ENTITY name") @@ -232,6 +245,7 @@ def auto_pos_int (x): # update default params with clargs if clargs.model: params.model = 
clargs.model + params.output_dir = clargs.output_dir if clargs.dataset: params.dataset = clargs.dataset if clargs.wandb_entity: From ee7cd7f2be432a8ad4c52693a3da978e962cae6f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 3 Apr 2024 00:38:29 +0100 Subject: [PATCH 084/204] created a script that renders dist matrices .npy as .png images --- scripts/shapes/distmatrices2contour.py | 73 ++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 scripts/shapes/distmatrices2contour.py diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py new file mode 100644 index 00000000..70e15a1d --- /dev/null +++ b/scripts/shapes/distmatrices2contour.py @@ -0,0 +1,73 @@ +import matplotlib.pyplot as plt +from sklearn.manifold import MDS +import numpy as np +import argparse +import pathlib +import types +import glob + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + return dm + +def asym_to_sym(asym_dist_mat): + return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed', normalized_stress='auto') + return embedding.fit_transform(dst_mat) + +def distmatrices2contour(params): + plt.clf() + dm_npys = glob.glob(f'{params.matrices_folder}/orig*.npy') + glob.glob(f'{params.matrices_folder}/recon*.npy') + for dm_npy in dm_npys: + dm = np.load(dm_npy) + vprint(2, f'{dm_npy}: dm.shape={dm.shape}') + dm = asym_to_sym(dm) + p = pathlib.Path(dm_npy) + p = p.with_suffix('.png') + reconstructed_coords = dist_to_coords(dm) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(p) + vprint(2, f'saved {p}') + plt.clf() + +############################################################################### + +params = types.SimpleNamespace(**{ + "matrices_folder": None +}) + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn distance matrices into contours') + + parser.add_argument('matrices_folder', metavar='MATRICES_FOLDER', help=f"The path to the matrices folder") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.matrices_folder = clargs.matrices_folder + + distmatrices2contour(params) From 20db342bf3aa2a0044fbb00f7e608b93bf384f06 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 08:38:59 +0100 Subject: [PATCH 085/204] new changes: sparisity, periodicity and also add a script to draw contours from dm --- scripts/shapes/drawContourFromDM.py | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 scripts/shapes/drawContourFromDM.py diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py new file mode 100644 index 00000000..39863aee --- /dev/null +++ b/scripts/shapes/drawContourFromDM.py @@ -0,0 +1,74 @@ + +import matplotlib.pyplot as plt +from sklearn.manifold import MDS +import numpy as np +import argparse +import pathlib +import types +import glob + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + #return dm + +def asym_to_sym(asym_dist_mat): + return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed', normalized_stress='auto') + return embedding.fit_transform(dst_mat) + +def distmatrices2contour(params): + plt.clf() + dm_npys = glob.glob(f'{params.matrices_folder}/*.npy') + for dm_npy in dm_npys: + dm = np.load(dm_npy) + vprint(2, f'{dm_npy}: dm.shape={dm.shape}') + dm = asym_to_sym(dm) + p = pathlib.Path(dm_npy) + p = p.with_suffix('.png') + reconstructed_coords = dist_to_coords(dm) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(p) + vprint(2, f'saved {p}') + plt.clf() + +############################################################################### + +params = types.SimpleNamespace(**{ + "matrices_folder": None +}) + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn distance matrices into contours') + + parser.add_argument('matrices_folder', metavar='MATRICES_FOLDER', help=f"The path to the matrices folder") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.matrices_folder = clargs.matrices_folder + + distmatrices2contour(params) From 784479870ee85ae04441253d6f2edd67894e0dc1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:37:39 +0100 Subject: [PATCH 086/204] masks2distmat: turn find_contour into find_longest_contour --- scripts/shapes/masks2distmatrices.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 023617ae..6b41755c 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -39,11 +39,13 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): ####### Simplified version in order to make the things properly work ##### ########################################################################## -def find_contour(mask): +def find_longest_contour(mask): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) - contour = sk.measure.find_contours(mask, 0.8)[0] - x, y = contour[:, 0], contour[:, 1] + contours = sk.measure.find_contours(mask, 0.8) + vprint(4, f'len(contours) {len(contours)}') + contours = sorted(contours, key=lambda x: len(x), reverse=True) + x, y = contours[0][:, 0], contours[0][:, 1] return x, y def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): @@ -68,7 +70,8 @@ def dist_to_coords(dst_mat): def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour - x, y = find_contour(mask) + x, y = find_longest_contour(mask) + vprint(3, f'found contour shape x {x.shape} y {y.shape}') # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) # Build the distance matrix @@ -128,7 +131,7 @@ def masks2distmatrices(params): # mask = plt.imread(mask_path) # # Get the contour -# x, y = find_contour(mask) +# x, y = find_longest_contour(mask) # # Reinterpolate (spline) # x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) From 177df9ef0f7cb407a39a67147b7dd4bf3cc5adda Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:38:35 +0100 Subject: [PATCH 087/204] masks2distmat: enable periodic splprep for closed contours --- scripts/shapes/masks2distmatrices.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 6b41755c..b48c429d 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -51,7 +51,9 @@ def find_longest_contour(mask): def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): # Sparsity of the contour. 
Dropping some of the sample (points) to make the spline smoother raw_sampling_sparsity = max(1, raw_sampling_sparsity) - tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0) + vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') + vprint(3, f'x.shape {x.shape} y.shape {y.shape}') + tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0, per = True) # How many times to sample the spline new_u = np.linspace(u.min(), u.max(), spline_sampling) # Last parameter is how dense is our spline, how many points. # Evaluate the spline From 4bd148756c0b50b39080c8ee8990dc9b249afd98 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:39:26 +0100 Subject: [PATCH 088/204] masks2distmat: updated default sparsity to 4 --- scripts/shapes/masks2distmatrices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index b48c429d..06e99f7b 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -162,7 +162,7 @@ def masks2distmatrices(params): params = types.SimpleNamespace(**{ "mask_dataset_path": None , "output_path": None - , "raw_sampling_sparsity": 1 + , "raw_sampling_sparsity": 4 , "spline_sampling": 512 }) From 6d951e4da3f568a691dc098138792a034d015fa0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:44:06 +0100 Subject: [PATCH 089/204] distmat2contour: removed spurious return statement in vprint --- scripts/shapes/distmatrices2contour.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py index 70e15a1d..754dd5fa 100644 --- a/scripts/shapes/distmatrices2contour.py +++ b/scripts/shapes/distmatrices2contour.py @@ -17,7 +17,6 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): print("verbosity level not set, defaulting to 0") vprint.lvl = 0 vprint(tgtlvl, msg) - return dm def asym_to_sym(asym_dist_mat): return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) From 1481b51e8b09bae2eb2d36fa4b3ab5769deccba3 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:47:35 +0100 Subject: [PATCH 090/204] drawContourFromDM: removed spurious return statement in vprint --- scripts/shapes/drawContourFromDM.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py index 39863aee..671a1b96 100644 --- a/scripts/shapes/drawContourFromDM.py +++ b/scripts/shapes/drawContourFromDM.py @@ -18,7 +18,6 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): print("verbosity level not set, defaulting to 0") vprint.lvl = 0 vprint(tgtlvl, msg) - #return dm def asym_to_sym(asym_dist_mat): return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) From 01c499ae5def089ac2333e7303a55f8fca1fb54e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 20:11:53 +0100 Subject: [PATCH 091/204] set correct aspect ratio for distmat2contour scripts --- scripts/shapes/distmatrices2contour.py | 1 + scripts/shapes/drawContourFromDM.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py index 754dd5fa..23b56bb8 100644 --- a/scripts/shapes/distmatrices2contour.py +++ b/scripts/shapes/distmatrices2contour.py @@ -35,6 +35,7 @@ def distmatrices2contour(params): p = pathlib.Path(dm_npy) p = p.with_suffix('.png') reconstructed_coords = dist_to_coords(dm) 
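        # dist_to_coords recovers 2-D points from the symmetrised distance
        # matrix via metric MDS (dissimilarity='precomputed'), so the contour
        # is only reconstructed up to an arbitrary rotation/reflection; the
        # equal aspect ratio set just below keeps that shape undistorted in
        # the saved plot. A rough, optional check of the residual error
        # (sketch only: euclidean_distances is not imported in this script
        # and would come from sklearn.metrics.pairwise):
        #   err = np.abs(dm - euclidean_distances(reconstructed_coords)).mean()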
+ plt.axes().set_aspect('equal') plt.scatter(*zip(*reconstructed_coords), s=6) plt.savefig(p) vprint(2, f'saved {p}') diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py index 671a1b96..fde5172f 100644 --- a/scripts/shapes/drawContourFromDM.py +++ b/scripts/shapes/drawContourFromDM.py @@ -36,6 +36,7 @@ def distmatrices2contour(params): p = pathlib.Path(dm_npy) p = p.with_suffix('.png') reconstructed_coords = dist_to_coords(dm) + plt.axes().set_aspect('equal') plt.scatter(*zip(*reconstructed_coords), s=6) plt.savefig(p) vprint(2, f'saved {p}') From 879c5f4534018b08416bbb60bd06449a05978daa Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:01:50 +0100 Subject: [PATCH 092/204] add different normalisations in dataset initial transformations --- scripts/shapes/distmatrix2embeding.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9f3616e5..e97d694c 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -48,7 +48,10 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x*1000, # scale the matrix + lambda x: x / x.max(), # normalize each element to one using the max value (0-1) + lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor From fc58ad329d1e921d9301b7d8e27122c8967211a9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:02:52 +0100 Subject: [PATCH 093/204] Add notion of class label to ditmat2emb script output --- scripts/shapes/distmatrix2embeding.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index e97d694c..a699eeb6 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -127,7 +127,10 @@ def main_process(params): ) dataloader.setup() + # Predict + ########################################################################### predictions = trainer.predict(lit_model, datamodule=dataloader) + class_indices = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### @@ -142,9 +145,12 @@ def main_process(params): # Save the latent space np.save(f'{output_dir}/latent_space.npy', latent_space) # Save the reconstructions - for i, pred in enumerate(predictions): - np.save(f'{output_dir}/original_{i}.npy', pred.x.data[0,0]) - np.save(f'{output_dir}/reconstruction_{i}.npy', pred.out.recon_x[0,0]) + for class_label in dataset.classes: + pathlib.Path(f'{output_dir}/{class_label}').mkdir(parents=True, exist_ok=True) + for i, (pred, class_idx) in enumerate(zip(predictions, class_indices)): + class_label = dataset.classes[class_idx] + np.save(f'{output_dir}/{class_label}/original_{i}_{class_label}.npy', pred.x.data[0,0]) + np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) # default parameters 
############################################################################### From 2a4d9559ee461c10a144d6260b843da4e6a82253 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:59:49 +0100 Subject: [PATCH 094/204] Updated default model path in distmat2emb script --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index a699eeb6..432b5075 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -185,7 +185,7 @@ def main_process(params): "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, - "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset_distmat"), + "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/distmat_datasets/tiny_synthcellshapes_dataset_distmat"), # optimizer_params "opt": "AdamW", "lr": 0.001, From 0c166ecfa128ac13a67bb9ad91499a9de76f66a5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 15 Apr 2024 22:16:25 +0100 Subject: [PATCH 095/204] Added umap and kmeans + original filenames list --- scripts/shapes/distmatrix2embeding.py | 57 +++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 432b5075..ee234a14 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,6 +1,11 @@ from torchvision import datasets, transforms import pytorch_lightning as pl +import pandas as pd import numpy as np +import umap +import umap.plot +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning @@ -130,27 +135,65 @@ def main_process(params): # Predict ########################################################################### predictions = trainer.predict(lit_model, datamodule=dataloader) + filenames = [sample[0] for sample in dataloader.get_dataset().samples] class_indices = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### - vprint(1, f'pull the embedings') - # Use the namespace variables - latent_space = torch.stack([d.out.z.flatten() for d in predictions]) # create the output directory output_dir = params.output_dir if output_dir is None: output_dir = f'./{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) - # Save the latent space - np.save(f'{output_dir}/latent_space.npy', latent_space) - # Save the reconstructions for class_label in dataset.classes: pathlib.Path(f'{output_dir}/{class_label}').mkdir(parents=True, exist_ok=True) - for i, (pred, class_idx) in enumerate(zip(predictions, class_indices)): + # Save the latent space + vprint(1, f'pull the embedings') + latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + np.save(f'{output_dir}/latent_space.npy', latent_space) + # Save the (original input and) reconstructions + for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): + vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') class_label = dataset.classes[class_idx] np.save(f'{output_dir}/{class_label}/original_{i}_{class_label}.npy', 
pred.x.data[0,0]) np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) + # umap + vprint(4, f'generate umap') + umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + mapper = umap_model.fit(latent_space) + umap.plot.points(mapper, labels=np.array([dataset.classes[x] for x in class_indices])) + plt.savefig(f'{output_dir}/umap.png') + + # kmean and clustering information + # Perform KMeans clustering on the UMAP result + vprint(4, f'cluster data with kmean') + n_clusters = 4 # Define the number of clusters + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + umap_result = umap_model.fit_transform(latent_space) + cluster_labels = kmeans.fit_predict(umap_result) + + # Concatenate the original data, UMAP result, and cluster labels + data_with_clusters = np.column_stack((latent_space, umap_result, cluster_labels)) + + # Convert to DataFrame for better handling + columns = [f'Feature_{i}' for i in range(latent_space.shape[1])] + \ + ['UMAP_Dimension_1', 'UMAP_Dimension_2', 'Cluster_Label'] + df = pd.DataFrame(data_with_clusters, columns=columns) + df['fname'] = filenames + + df.to_csv(f'{output_dir}/clustered_data.csv', index=False) + + # Plot the UMAP result with cluster labels + plt.figure(figsize=(10, 8)) + for i in range(n_clusters): + plt.scatter(umap_result[cluster_labels == i, 0], umap_result[cluster_labels == i, 1], label=f'Cluster {i+1}', s=5) + plt.title('UMAP Visualization of Latent Space with KMeans Clustering') + plt.xlabel('UMAP Dimension 1') + plt.ylabel('UMAP Dimension 2') + plt.legend() + + # Save the figure + plt.savefig(f'{output_dir}/umap_with_kmeans_clusters.png') # default parameters ############################################################################### From 4826ba3554b69f11324fbc537c3349e68495887b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:11:18 +0100 Subject: [PATCH 096/204] dist2emb: random seed for np and pl --- scripts/shapes/distmatrix2embeding.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ee234a14..0b9bc12d 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -17,6 +17,10 @@ import torch import types +# Seed everything +np.random.seed(42) +pl.seed_everything(42) + # misc helpers ############################################################################### From 3e562472c49a1620210b75f07cd7ae93dfa2e41b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:12:07 +0100 Subject: [PATCH 097/204] dist2emb: test different initial transformations --- scripts/shapes/distmatrix2embeding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 0b9bc12d..e1221f70 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -57,15 +57,17 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ + lambda x: x / 256, # scale the matrix to the number of pixels #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix - #lambda x: x*1000, # scale the matrix - lambda x: x / x.max(), # normalize each element to one using the max value (0-1) - lambda x: x*255, # scale the matrix to 255 + lambda x: x*100, # scale the matrix + #lambda x: x / x.max(), # normalize each element to one using 
the max value (0-1) + #lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) + dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) #dataset = datasets.DatasetFolder(params.dataset[1], loader=lambda x: np.load(x, allow_pickle=True), extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( From a29c57d9abf3acf641ad9186c4f693bbb3b59136 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:13:09 +0100 Subject: [PATCH 098/204] dist2emb: remove "TODO" from prints --- scripts/shapes/distmatrix2embeding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index e1221f70..f815cb74 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -118,12 +118,12 @@ def main_process(params): #TODO: Validate the model ########################################################################### - vprint(1, f'TODO: Validate the model') + vprint(1, f'Validate the model') validation = trainer.validate(lit_model, datamodule=dataloader) #TODO: Test the model ########################################################################### - vprint(1, f'TODO: Test the model') + vprint(1, f'Test the model') testing = trainer.test(lit_model, datamodule=dataloader) # Inference on full dataset From bf5ce7f8bdd6f5acbe826b916e4dc5d336542f0f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:19:39 +0100 Subject: [PATCH 099/204] cosmetics --- bioimage_embed/lightning/torch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index 53d649fe..e559215d 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -9,7 +9,6 @@ from pythae.models.base.base_utils import ModelOutput import torch.nn.functional as F - class LitAutoEncoderTorch(pl.LightningModule): args = argparse.Namespace( opt="adamw", From 41890e564d0e02c8202d1dee14d5e0a2d91e911c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:20:15 +0100 Subject: [PATCH 100/204] LitAutoEncoderTorch: return both loss and recon_loss --- bioimage_embed/lightning/torch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index e559215d..ab730c3f 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -93,7 +93,11 @@ def training_step(self, batch, batch_idx): return loss def loss_function(self, model_output, *args, **kwargs): - return model_output.loss + #return model_output.loss + return { + "loss": model_output.loss, + "recon_loss": model_output.recon_loss, + } # def logging_step(self, z, loss, x, model_output, batch_idx): # self.logger.experiment.add_embedding( From beb571483fa1e21947c0a900ff59a8618cfd61d0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:21:56 +0100 Subject: [PATCH 101/204] MaskEmbed: turn off normalisation in DistanceMatrixLoss --- bioimage_embed/shapes/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index e5ec529e..2664695a 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -35,7 +35,7 @@ def batch_to_tensor(self, batch): return ModelOutput(data=normalised_data / scalings, scalings=scalings) def loss_function(self, model_output, *args, **kwargs): - loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=True) + loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=False) loss = model_output.loss loss += torch.sum( torch.stack( From da092b5c0e697511b88092a552b5c23ef12ee2c1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:27:53 +0100 Subject: [PATCH 102/204] MaskEmbed: log losses in loss_function method --- bioimage_embed/shapes/lightning.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 2664695a..941007a9 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -37,22 +37,38 @@ def batch_to_tensor(self, batch): def loss_function(self, model_output, *args, **kwargs): loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=False) loss = model_output.loss - loss += torch.sum( + shape_loss = torch.sum( torch.stack( [ loss_ops.diagonal_loss(), loss_ops.symmetry_loss(), - # loss_ops.triangle_inequality(), loss_ops.non_negative_loss(), + # loss_ops.triangle_inequality(), # loss_ops.clockwise_order_loss(), ] ) ) + loss += shape_loss # loss += lf.diagonal_loss(model_output.recon_x) # loss += lf.symmetry_loss(model_output.recon_x) # loss += lf.triangle_inequality_loss(model_output.recon_x) # loss += lf.non_negative_loss(model_output.recon_x) + #return loss + + #variational_loss = model_output.loss - model_output.recon_loss + + self.log_dict( + { + "loss": loss, + "shape_loss": shape_loss, + "reconstruction_loss": model_output.recon_loss, + "variational_loss": model_output.vq_loss, + }, + on_epoch=True, + prog_bar=True, + logger=True, + ) return loss From 78d067d46916a024a384260e2293767a8900f1d4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:00:38 +0100 Subject: [PATCH 103/204] renamed varitional_loss to vq_loss + comment out kdl_vae_loss --- bioimage_embed/shapes/lightning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 941007a9..02ef0e4e 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -63,7 +63,8 @@ def loss_function(self, model_output, *args, **kwargs): "loss": loss, "shape_loss": shape_loss, "reconstruction_loss": model_output.recon_loss, - "variational_loss": model_output.vq_loss, + "vq_loss": model_output.vq_loss, + #"kdl_vae_loss": model_output.KLD }, on_epoch=True, prog_bar=True, From d0c4ffbcf6de0205afd628cabe8fd4ddaaa19316 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:01:44 +0100 Subject: [PATCH 104/204] removed new line --- bioimage_embed/shapes/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 02ef0e4e..e8dfee80 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -144,4 +144,4 @@ def training_step(self, batch, batch_idx, optimizer_idx=0): def configure_optimizers(self): opt_ed, lr_s_ed = self.timm_optimizers(self.model) - return 
self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) + return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) \ No newline at end of file From 253b3f4b7aeb05ff8f5d73b2c6bceb85efb16dc7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:03:28 +0100 Subject: [PATCH 105/204] Normalise contour coord in mask2distmat script --- scripts/shapes/masks2distmatrices.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 06e99f7b..c6af9ae8 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -39,13 +39,18 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): ####### Simplified version in order to make the things properly work ##### ########################################################################## -def find_longest_contour(mask): +def find_longest_contour(mask, normalise_coord=False): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) contours = sk.measure.find_contours(mask, 0.8) vprint(4, f'len(contours) {len(contours)}') contours = sorted(contours, key=lambda x: len(x), reverse=True) x, y = contours[0][:, 0], contours[0][:, 1] + if normalise_coord: + x = x - np.min(x) + x = x / np.max(x) + y = y - np.min(y) + y = y / np.max(y) return x, y def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): @@ -72,7 +77,7 @@ def dist_to_coords(dst_mat): def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour - x, y = find_longest_contour(mask) + x, y = find_longest_contour(mask, normalise_coord=True) vprint(3, f'found contour shape x {x.shape} y {y.shape}') # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) From 5066e3db864c1be535c3dcced3240654454fa61f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:05:07 +0100 Subject: [PATCH 106/204] Use bokeh for interactive umap plot (save as html file) --- scripts/shapes/distmatrix2embeding.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index f815cb74..b4d1762e 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -4,6 +4,7 @@ import numpy as np import umap import umap.plot +import bokeh.plotting import matplotlib.pyplot as plt from sklearn.cluster import KMeans import bioimage_embed @@ -157,6 +158,11 @@ def main_process(params): vprint(1, f'pull the embedings') latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() np.save(f'{output_dir}/latent_space.npy', latent_space) + df = pd.DataFrame(latent_space) + df['class_idx'] = class_indices + df['class'] = [dataset.classes[x] for x in class_indices] + df['fname'] = filenames + #df.to_pickle(f'{output_dir}/latent_space.pkl') # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -166,9 +172,13 @@ def main_process(params): # umap vprint(4, f'generate umap') umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) - mapper = umap_model.fit(latent_space) - umap.plot.points(mapper, 
labels=np.array([dataset.classes[x] for x in class_indices])) + mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) + umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/umap.png') + p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + # save interactive plot as html + bokeh.plotting.output_file(f"{output_dir}/umap.html") + bokeh.plotting.save(p) # kmean and clustering information # Perform KMeans clustering on the UMAP result From 977620ecb13a0beca07f61bb61b7157b4d5fd236 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Apr 2024 22:59:20 +0100 Subject: [PATCH 107/204] save latent_space with extra info as pickle again and have a separate gen UMAPs script --- scripts/shapes/distmatrix2embeding.py | 7 +- scripts/shapes/genUMAPs.py | 133 ++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 3 deletions(-) create mode 100755 scripts/shapes/genUMAPs.py diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index b4d1762e..07b0062a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -162,7 +162,7 @@ def main_process(params): df['class_idx'] = class_indices df['class'] = [dataset.classes[x] for x in class_indices] df['fname'] = filenames - #df.to_pickle(f'{output_dir}/latent_space.pkl') + df.to_pickle(f'{output_dir}/latent_space.pkl') # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -171,11 +171,12 @@ def main_process(params): np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) # umap vprint(4, f'generate umap') - umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + umap_model = umap.UMAP(n_neighbors=50, min_dist=0.8, n_components=2, random_state=42) mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/umap.png') - p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) # save interactive plot as html bokeh.plotting.output_file(f"{output_dir}/umap.html") bokeh.plotting.save(p) diff --git a/scripts/shapes/genUMAPs.py b/scripts/shapes/genUMAPs.py new file mode 100755 index 00000000..09121647 --- /dev/null +++ b/scripts/shapes/genUMAPs.py @@ -0,0 +1,133 @@ +#! 
/usr/bin/env python3 + +import os +import os.path +import pandas as pd +import numpy as np +import umap +import umap.plot +import matplotlib.pyplot as plt +import bokeh.plotting +import argparse +import datetime +import pathlib +import multiprocessing +import subprocess + +# Seed everything +np.random.seed(42) + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + +# render UMAPS +def render_umap_core(df, output_dir, n_neighbors, min_dist, n_components): + name = f'umap_{n_neighbors}_{min_dist}_{n_components}' + vprint(4, f'generate {name}') + vprint(5, f'n_neigbhors: {type(n_neighbors)} {n_neighbors}') + vprint(5, f'min_dist: {type(min_dist)} {min_dist}') + vprint(5, f'n_components: {type(n_components)} {n_components}') + umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=42) + mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) + umap.plot.points(mapper, labels=np.array(df['class'])) + plt.savefig(f'{output_dir}/{name}.png') + #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + # save interactive plot as html + bokeh.plotting.output_file(f"{output_dir}/{name}.html") + bokeh.plotting.save(p) + +def render_umap(latent_space_pkl, output_dir, n_neighbors, min_dist, n_components): + # create output directory if it does not already exist + os.makedirs(output_dir, exist_ok=True) + # load latent space + df = pd.read_pickle(latent_space_pkl) + # render umap + render_umap_core(df, output_dir, n_neighbors, min_dist, n_components) + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='generate umaps') + + parser.add_argument('latent_space', metavar='LATENT_SPACE', type=os.path.abspath + , help=f"The path to the latent space") + parser.add_argument('-j', '--n_jobs', type=auto_pos_int, default=2*os.cpu_count() + , help="number of jobs to start. Default is 2x the number of CPUs.") + parser.add_argument('--slurm', action=argparse.BooleanOptionalAction) + parser.add_argument('-n', '--n_neighbors', nargs='+', type=auto_pos_int, default=[50] + , help="A list of the number of neighbors to use in UMAP. Default is [50].") + parser.add_argument('-m', '--min_dist', nargs='+', type=float, default=[0.8] + , help="A list of the minimum distances to use in UMAP. Default is [0.8].") + parser.add_argument('-c', '--n_components', nargs='+', type=auto_pos_int, default=[2] + , help="A list of the number of components to use in UMAP. 
Default is [2].") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR', default=f'{os.getcwd()}/umaps' + , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + #for x,y,z in [(x, y, z) for x in clargs.n_neighbors + # for y in clargs.min_dist + # for z in clargs.n_components]: + # render_umap(df, x, y, z) + + params=[(x, y, z) for x in clargs.n_neighbors + for y in clargs.min_dist + for z in clargs.n_components] + if clargs.slurm: + vprint(2, f'running with slurm') + for (n_neighbors, min_dist, n_components) in params: + vprint(3, f'running with n_neighbors={n_neighbors}, min_dist={min_dist}, n_components={n_components}') + print('Directory Name: ', os.path.dirname(__file__)) + + cmd = [ "srun" + , "-t", "50:00:00" + , "--mem=200G" + , "--gpus=a100:1" + , "--job-name", f"render_umap_{n_neighbors}_{min_dist}_{n_components}" + , "--pty" + , "python3", "-c" + , f""" +import sys +sys.path.insert(1, '{os.path.dirname(__file__)}') +import genUMAPs +genUMAPs.render_umap('{clargs.latent_space}','{clargs.output_dir}',{n_neighbors},{min_dist},{n_components}) +"""] + vprint(4, cmd) + subprocess.run(cmd) + + else: + vprint(2, f'running with python multiprocessing') + + # create output directory if it does not already exist + os.makedirs(clargs.output_dir, exist_ok=True) + + # load latent space + df = pd.read_pickle(clargs.latent_space) + + def render_umap_wrapper(args): + render_umap(df, clargs.output_dir, *args) + with multiprocessing.Pool(clargs.n_jobs) as pool: + pool.starmap(render_umap_wrapper, params) \ No newline at end of file From ccdc090caf77188c3053beb0036db2abdbeacb63 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 26 Apr 2024 08:52:05 +0100 Subject: [PATCH 108/204] updated the render umap script with a _hardcoded_ trick to extract index from filename for the tree dataset (should use a user specified regex instead) --- scripts/shapes/genUMAPs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/genUMAPs.py b/scripts/shapes/genUMAPs.py index 09121647..265f0525 100755 --- a/scripts/shapes/genUMAPs.py +++ b/scripts/shapes/genUMAPs.py @@ -40,8 +40,15 @@ def render_umap_core(df, output_dir, n_neighbors, min_dist, n_components): mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/{name}.png') + theme_values = df.drop(['class_idx','class','fname'], axis=1).mean(axis=1) + vprint(5, f'theme_values type: {type(theme_values)}') + if True: #temporary condition to work ONLY with the tree dataset + theme_values = list(map(lambda x: int(x.split('_')[-1].split('.')[0]), df['fname'])) + vprint(5, f'new theme_values type: {type(theme_values)}') + vprint(5, f'theme_values: {theme_values}') #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) - p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + #p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=theme_values, theme='viridis', 
hover_data=df[['class','fname']]) # save interactive plot as html bokeh.plotting.output_file(f"{output_dir}/{name}.html") bokeh.plotting.save(p) @@ -113,6 +120,7 @@ def auto_pos_int (x): import sys sys.path.insert(1, '{os.path.dirname(__file__)}') import genUMAPs +genUMAPs.vprint.lvl = {clargs.verbose} genUMAPs.render_umap('{clargs.latent_space}','{clargs.output_dir}',{n_neighbors},{min_dist},{n_components}) """] vprint(4, cmd) From 3601e3bd2f8ee5f892d3786f1dc924be431ca1b8 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Apr 2024 22:11:00 +0100 Subject: [PATCH 109/204] minor config + comments --- scripts/shapes/distmatrix2embeding.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 07b0062a..ff02861a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -58,11 +58,9 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - lambda x: x / 256, # scale the matrix to the number of pixels - #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix - lambda x: x*100, # scale the matrix + lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x*1000, # scale the matrix #lambda x: x / x.max(), # normalize each element to one using the max value (0-1) - #lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor @@ -97,6 +95,9 @@ def main_process(params): jobname = f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) wandblogger.watch(lit_model, log="all") + # TODO: Sanity check: + # test_data = dataset[0][0].unsqueeze(0) + # test_output = lit_model.forward((test_data,)) # Train the model ########################################################################### @@ -211,6 +212,9 @@ def main_process(params): # Save the figure plt.savefig(f'{output_dir}/umap_with_kmeans_clusters.png') + + # Test embeding for a classifcation task + # default parameters ############################################################################### From 3f98d068b31cc1a8b576a9a495ab230cd90b9215 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 8 May 2024 22:26:50 +0100 Subject: [PATCH 110/204] added a beta vae model --- bioimage_embed/models/factory.py | 14 ++++++++++++++ bioimage_embed/shapes/lightning.py | 4 ++-- scripts/shapes/distmatrix2embeding.py | 15 ++++++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 749ebaa6..713b98af 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -123,6 +123,19 @@ def resnet18_vqvae(self): bolts.ResNet18VQVAEDecoder, ) + def resnet18_beta_vae(self): + return self.create_model( + partial( + pythae.models.BetaVAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.BetaVAE, + bolts.ResNet18VAEEncoder, + bolts.ResNet18VAEDecoder, + ) + def resnet50_vqvae(self): return self.create_model( partial( @@ -177,6 +190,7 @@ def resnet152_vqvae_legacy(self): MODELS = [ "resnet18_vae", + "resnet18_beta_vae", 
"resnet50_vae", "resnet18_vae_bolt", "resnet50_vae_bolt", diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index e8dfee80..02202fb8 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -63,8 +63,8 @@ def loss_function(self, model_output, *args, **kwargs): "loss": loss, "shape_loss": shape_loss, "reconstruction_loss": model_output.recon_loss, - "vq_loss": model_output.vq_loss, - #"kdl_vae_loss": model_output.KLD + #"vq_loss": model_output.vq_loss, + "KLD_loss": model_output.reg_loss, }, on_epoch=True, prog_bar=True, diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ff02861a..9a8f3dee 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -17,6 +17,7 @@ import pathlib import torch import types +import re # Seed everything np.random.seed(42) @@ -81,11 +82,15 @@ def main_process(params): # Build the model ########################################################################### + extra_params = {} + if re.match(".*_beta_vae", params.model): + extra_params['beta'] = params.model_beta_vae_beta model = bioimage_embed.models.create_model( model=params.model, input_dim=params.input_dim, latent_dim=params.latent_dim, pretrained=params.pretrained, + **extra_params ) lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') @@ -222,6 +227,7 @@ def main_process(params): models = [ "resnet18_vae" , "resnet50_vae" +, "resnet18_beta_vae" , "resnet18_vae_bolt" , "resnet50_vae_bolt" , "resnet18_vqvae" @@ -250,6 +256,8 @@ def main_process(params): "decay": 0.99, "frobenius_norm": False, "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/distmat_datasets/tiny_synthcellshapes_dataset_distmat"), + # model-specific params + "model_beta_vae_beta": 1, # optimizer_params "opt": "AdamW", "lr": 0.001, @@ -280,6 +288,9 @@ def auto_pos_int (x): parser.add_argument( '-m', '--model', choices=models, metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {params.model}).") + parser.add_argument( + '--model-beta-vae-beta', type=float, metavar='BETA' + , help=f"The BETA parameter to use for a beta-vae model.") parser.add_argument( '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") @@ -318,6 +329,8 @@ def auto_pos_int (x): # update default params with clargs if clargs.model: params.model = clargs.model + if clargs.model_beta_vae_beta: + params.model_beta_vae_beta = clargs.model_beta_vae_beta params.output_dir = clargs.output_dir if clargs.dataset: params.dataset = clargs.dataset @@ -339,4 +352,4 @@ def auto_pos_int (x): params.epochs = clargs.num_epochs # run main process - main_process(params) \ No newline at end of file + main_process(params) From f6ee1ac158291d110d2bd22e1859b3b5f16af707 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 8 May 2024 22:45:06 +0100 Subject: [PATCH 111/204] added extra parameters in the wandb jobname --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9a8f3dee..227b795b 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -97,7 +97,7 @@ def main_process(params): # WandB logger ########################################################################### - jobname = 
f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" + jobname = f"{params.model}_{'_'.join([f'{k}{v}' for k, v in extra_params.items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) wandblogger.watch(lit_model, log="all") # TODO: Sanity check: From 01b746fc3b44ef7371312fc600cf26f6e0cb260a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 10 May 2024 22:42:21 +0100 Subject: [PATCH 112/204] finer grained clargs around latent space related parameters --- scripts/shapes/distmatrix2embeding.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 227b795b..eaf28376 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -248,9 +248,9 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - "latent_dim": 512, - "num_embeddings": 512, - "num_hiddens": 512, + "latent_dim": 1024, + "num_embeddings": 1024, + "num_hiddens": 1024, "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, @@ -309,6 +309,15 @@ def auto_pos_int (x): parser.add_argument( '-l', '--latent-space-size', metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help=f"The LATENT_SPACE_SIZE, a positive integer (default {params.latent_dim})") + parser.add_argument( + '--input-dimensions', metavar='INPUT_DIM', nargs=2, type=auto_pos_int + , help=f"The width and height INPUT_DIM for the input dimensions (default {params.input_dim[1]} and {params.input_dim[2]})") + parser.add_argument( + '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int + , help=f"The NUM_EMBEDDINGS, a positive integer (default {params.num_embeddings})") + parser.add_argument( + '--number-hiddens', metavar='NUM_HIDDENS', type=auto_pos_int + , help=f"The NUM_HIDDENS, a positive integer (default {params.num_hiddens})") parser.add_argument( '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int , help=f"The NUM_WORKERS for the run, a positive integer (default {params.num_workers})") @@ -341,11 +350,13 @@ def auto_pos_int (x): if clargs.batch_size: params.batch_size = clargs.batch_size if clargs.latent_space_size: - interp_size = clargs.latent_space_size * 2 - params.input_dim = (params.input_dim[0], interp_size, interp_size) - params.latent_dim = interp_size - params.num_embeddings = interp_size - params.num_hiddens = interp_size + params.latent_dim = clargs.latent_space_size + if clargs.input_dimensions: + params.input_dim = (params.input_dim[0], clargs.input_dimensions[0], clargs.input_dimensions[1]) + if clargs.number_embeddings: + params.num_embeddings = clargs.number_embeddings + if clargs.number_hiddens: + params.num_hiddens = clargs.number_hiddens if clargs.num_workers: params.num_workers = clargs.num_workers if clargs.num_epochs: From d493ea5f06cb5532f0f415da07dd8fc5d9b7f971 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 10 May 2024 23:09:51 +0100 Subject: [PATCH 113/204] log different losses for vq or beta models --- bioimage_embed/shapes/lightning.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 02202fb8..a9a1e947 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -7,6 +7,7 @@ from torch import nn from ..lightning import 
LitAutoEncoderTorch from . import loss_functions as lf +import pythae from pythae.models.base.base_utils import ModelOutput from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint from types import SimpleNamespace @@ -58,14 +59,18 @@ def loss_function(self, model_output, *args, **kwargs): #variational_loss = model_output.loss - model_output.recon_loss + metrics = { + "loss": loss, + "shape_loss": shape_loss, + "reconstruction_loss": model_output.recon_loss, + } + if isinstance(self.model, pythae.models.VQVAE): + metrics["vq_loss"] = model_output.vq_loss + if isinstance(self.model, pythae.models.BetaVAE): + metrics['KLD_loss'] = model_output.reg_loss + self.log_dict( - { - "loss": loss, - "shape_loss": shape_loss, - "reconstruction_loss": model_output.recon_loss, - #"vq_loss": model_output.vq_loss, - "KLD_loss": model_output.reg_loss, - }, + metrics, on_epoch=True, prog_bar=True, logger=True, @@ -144,4 +149,4 @@ def training_step(self, batch, batch_idx, optimizer_idx=0): def configure_optimizers(self): opt_ed, lr_s_ed = self.timm_optimizers(self.model) - return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) \ No newline at end of file + return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) From b15dff4d6dc15150ce1703fb7e9cdf1abdf27ac4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 13 May 2024 17:24:58 +0100 Subject: [PATCH 114/204] code to do classification using the features of the latent space --- scripts/shapes/check_latent_space.py | 125 +++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 scripts/shapes/check_latent_space.py diff --git a/scripts/shapes/check_latent_space.py b/scripts/shapes/check_latent_space.py new file mode 100644 index 00000000..6fb085a4 --- /dev/null +++ b/scripts/shapes/check_latent_space.py @@ -0,0 +1,125 @@ +import pandas as pd +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split, cross_validate +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +from sklearn.pipeline import Pipeline +from sklearn import svm +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.metrics import classification_report, confusion_matrix +import umap +import seaborn as sns +import matplotlib.pyplot as plt +import os +from tabulate import tabulate +import json + +pd.set_option('display.max_colwidth', None) + +df = pd.read_csv("clustered_data.csv") + +df.insert(0, 'label', df['fname'].str.extract(r'^(?:[^/]*/){7}([^/]*)').squeeze()) +df.insert(0, 'n_label', df['label'].apply(lambda x: 0 if x == 'alive' else 1)) + +new_df = df.iloc[:, :-4] + +y = new_df.iloc[:, 0] +X = new_df.iloc[:, 2:] + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) + +def build_and_evaluate_model(clf, X_train, y_train, X_test, y_test): + model = Pipeline( + [ + ("scaler", StandardScaler()), + ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", clf), + ] + ) + + pipeline = model.fit(X_train, y_train) + + score = pipeline.score(X_test, y_test) + print(f"Classification score: {score}") + + y_pred = pipeline.predict(X_test) + + print("Classification Report:") + print(classification_report(y_test, y_pred)) + + print("Confusion Matrix:") + cm = confusion_matrix(y_test, y_pred) + print(cm) + + # Cross-validation + cv_results = cross_validate(pipeline, X, y, cv=5) + print("Cross-validation results:") + print(cv_results) + + # Plot and save 
the confusion matrix + plt.figure(figsize=(10,7)) + sns.heatmap(cm, annot=True, fmt='d') + plt.xlabel('Predicted') + plt.ylabel('Truth') + plt.title(f'Confusion Matrix for {clf.__class__.__name__}') + plt.savefig(f'confusion_matrix_{clf.__class__.__name__}.png') + plt.clf() # Clear the current figure + + return score, cm, cv_results + +classifiers = [RandomForestClassifier(), GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0), svm.SVC()] + +results = [] + +for clf in classifiers: + score, cm, cv_results = build_and_evaluate_model(clf, X_train, y_train, X_test, y_test) + results.append((clf.__class__.__name__, score, cm, cv_results)) + +known_labels = list(y[:50]) +unknown_labels = [-1]*len(y[50:]) +partial_labels = known_labels + unknown_labels + +reducer = umap.UMAP() +embedding = reducer.fit_transform(X, y=partial_labels) + +plt.scatter(embedding[:, 0], embedding[:, 1], c=partial_labels, cmap='Spectral', s=5) +plt.gca().set_aspect('equal', 'datalim') +plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10)) +plt.title('UMAP projection of the dataset', fontsize=24) + +plt.savefig('umap_visualization.png') +plt.clf() # Clear the current figure + +# Generate LaTeX report +with open('final_report.tex', 'w') as f: + f.write("\\documentclass{article}\n\\usepackage{graphicx}\n\\usepackage{longtable}\n\\usepackage{listings}\n\\begin{document}\n") + for name, score, cm, cv_results in results: + f.write(f"\\section*{{Results for {name}}}\n") + f.write("\\begin{longtable}{|l|l|}\n") + f.write("\\hline\n") + f.write(f"Classification Score & {score} \\\\\n") + f.write("\\hline\n") + f.write("Confusion Matrix & \\\\\n") + f.write("\\begin{lstlisting}\n") + f.write(np.array2string(cm).replace('\n', ' \\\\\n')) + f.write("\\end{lstlisting}\n") + f.write("\\hline\n") + f.write("Cross-validation Results & \\\\\n") + f.write("\\begin{lstlisting}\n") + cv_results_df = pd.DataFrame(cv_results) + cv_results_df = cv_results_df.applymap(lambda x: x.tolist() if isinstance(x, np.ndarray) else x) + f.write(cv_results_df.to_string().replace('\n', ' \\\\\n')) + f.write("\\end{lstlisting}\n") + f.write("\\hline\n") + f.write("\\end{longtable}\n") + f.write("\\section*{UMAP visualization}\n") + f.write("\\includegraphics[width=\\textwidth]{umap_visualization.png}\n") + f.write("\\end{document}\n") + +os.system('pdflatex final_report.tex') + +# Generate CSV report +report_df = pd.DataFrame(results, columns=['Classifier', 'Score', 'Confusion Matrix', 'Cross-validation Results']) +report_df['Cross-validation Results'] = report_df['Cross-validation Results'].apply(lambda x: pd.DataFrame(x).applymap(lambda y: y.tolist() if isinstance(y, np.ndarray) else y).to_dict()) +report_df.to_csv('final_report.csv', index=False) From 1a2bcf15fb51e60f192064f8920137c69138399e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:36:47 +0100 Subject: [PATCH 115/204] new latent space size --- scripts/shapes/distmatrix2embeding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index eaf28376..c388b047 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -241,6 +241,7 @@ def main_process(params): , "resnet50_vae_legacy" ] +matrix_dim = 512 params = types.SimpleNamespace(**{ # general params "model": "resnet18_vae", @@ -248,7 +249,7 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - 
"latent_dim": 1024, + "latent_dim": int((matrix_dim**2 - matrix_dim) / 2), "num_embeddings": 1024, "num_hiddens": 1024, "pretrained": True, From b6f12c2dfb32c6c903f4c7b1de1ede03efc79aac Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:38:38 +0100 Subject: [PATCH 116/204] Added imports that will be needed for next commits --- scripts/shapes/distmatrix2embeding.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c388b047..ff5145c2 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,3 +1,5 @@ +import seaborn as sns +import pyefd from torchvision import datasets, transforms import pytorch_lightning as pl import pandas as pd @@ -12,12 +14,34 @@ import bioimage_embed.lightning from bioimage_embed.lightning import DataModule from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint import argparse import datetime import pathlib import torch import types import re +import shutil +from pathlib import Path +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold +from sklearn.metrics import make_scorer +from sklearn import metrics +from sklearn.discriminant_analysis import StandardScaler +from sklearn.ensemble import RandomForestClassifier +from sklearn.pipeline import Pipeline +from skimage import measure +from tqdm import tqdm +import logging + +from bioimage_embed.shapes.transforms import ( + ImageToCoords, + CropCentroidPipeline +) + +import pickle +import base64 +import hashlib +import os # Seed everything np.random.seed(42) From 79e75ced94925d971fad1542b0a45dd7fed3fc57 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:40:22 +0100 Subject: [PATCH 117/204] Added checkpoint mechanism --- scripts/shapes/distmatrix2embeding.py | 42 +++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ff5145c2..45da013f 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -118,7 +118,15 @@ def main_process(params): ) lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') - + + model_dir = f"checkpoints/{hashing_fn(params)}" + + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(params)}" + # WandB logger ########################################################################### jobname = f"{params.model}_{'_'.join([f'{k}{v}' for k, v in extra_params.items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" @@ -131,22 +139,46 @@ def main_process(params): # Train the model ########################################################################### + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) + trainer = pl.Trainer( logger=[wandblogger], gradient_clip_val=0.5, - enable_checkpointing=False, + enable_checkpointing=True, devices=1, accelerator="gpu", accumulate_grad_batches=4, - #TODO callbacks=[checkpoint_callback], + callbacks=[checkpoint_callback], min_epochs=50, max_epochs=params.epochs, log_every_n_steps=1, ) + + # Determine the checkpoint path for resuming + last_checkpoint_path 
= f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + trainer.fit(lit_model, datamodule=dataloader) lit_model.eval() vprint(1, f'trainer fitted') + #TODO: Validate the model ########################################################################### vprint(1, f'Validate the model') @@ -349,8 +381,8 @@ def auto_pos_int (x): parser.add_argument( '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int , help=f"The NUM_EPOCHS for the run, a positive integer (default {params.epochs})") - #parser.add_argument('--clear-checkpoints', action='store_true' - # , help='remove checkpoints') + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") From a8406ea52dcdbf87cd07d1590077ee1d873bc1ac Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:41:35 +0100 Subject: [PATCH 118/204] Added regionprops + fourrier decomposition trials (! hardcoded path to synthetic shapes dataset needs to be generalised) --- scripts/shapes/distmatrix2embeding.py | 203 +++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 45da013f..b42cd663 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -74,6 +74,156 @@ def sanity_check (dist_mat): raise ValueError("Matrix has non-zero diagonal") return dist_mat +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def scoring_df(X, y): + # Split the data into training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y + ) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.accuracy_score), + "precision": make_scorer(metrics.precision_score, average="macro"), + "recall": make_scorer(metrics.recall_score, average="macro"), + "f1": make_scorer(metrics.f1_score, average="macro"), + } + + # Create a random forest classifier + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + # ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", RandomForestClassifier()), + # ("clf", DummyClassifier()), + ] + ) + + # Specify the number of folds + k_folds = 5 + + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=pipeline, + X=X, + y=y, + cv=KFold(n_splits=k_folds), + scoring=scoring, + n_jobs=-1, + return_train_score=False, + ) + + # Put the results into a DataFrame + return pd.DataFrame(cv_results) + +def create_regionprops_df( dataset + , properties = [ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] ): + dfs = [] + # Distance matrix data + for i, data in enumerate(tqdm(dataset)): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, so we take the first element [0] + # 
props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + return pd.concat(dfs) + +def create_efd_df(dataset): + dfs = [] + for i, data in enumerate(tqdm(dataset)): + # Convert the tensor to a numpy array + X, y = data + print(f" The image: {i}") + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + return pd.concat(dfs) + +def run_trials( trials, outputdir + , logger = logging.getLogger(__name__) + , width = 3.45 + , height = 3.45 / 1.618 ): + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + logger.info(trial["score_df"]) + trial["score_df"].to_csv(f"{outputdir}/{trial['name']}_score_df.csv") + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(f"{outputdir}/trial_df.csv") + trial_df.groupby("trial").mean().to_csv(f"{outputdir}/trial_df_mean.csv") + trial_df.plot(kind="bar") + + avg = trial_df.groupby("trial").mean() + logger.info(avg) + avg.to_latex(f"{outputdir}/trial_df.tex") + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(f"{outputdir}/trials_barplot.pdf") + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + logger.info(avs) + # Main process ############################################################################### @@ -219,12 +369,60 @@ def main_process(params): # Save the latent space vprint(1, f'pull the embedings') latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + np.save(f'{output_dir}/latent_space.npy', latent_space) df = pd.DataFrame(latent_space) df['class_idx'] = class_indices - df['class'] = [dataset.classes[x] for x in class_indices] + #df['class'] = [dataset.classes[x] for x in class_indices] + df['class'] = pd.Series([dataset.classes[x] for x in class_indices]).astype("category") df['fname'] = filenames + #df['scale'] = scalings[:,0].squeeze() df.to_pickle(f'{output_dir}/latent_space.pkl') + + df_shape_embed = df.drop('fname', axis=1).copy() + df_shape_embed = df_shape_embed.set_index('class') + #regionprop_dataset = 
datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + transforms.Grayscale(1) + #, CropCentroidPipeline(128 * 2) + ])) + df_regionprops = create_regionprops_df(regionprop_dataset) + #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + transforms.Grayscale(1) + #, CropCentroidPipeline(128 * 2) + , ImageToCoords(128 * 2) + ])) + print(efd_dataset) + df_efd = create_efd_df(efd_dataset) + + # setup trials + trials = [ + { + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_efd.xs("coeffs", level="coeffs"), + "labels": df_efd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_efd.xs("norm_coeffs", level="coeffs"), + # "labels": df_efd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + } + ] + + run_trials(trials, output_dir) + + # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -419,5 +617,6 @@ def auto_pos_int (x): if clargs.num_epochs: params.epochs = clargs.num_epochs + logging.basicConfig(level=logging.INFO) # run main process - main_process(params) + main_process(params) \ No newline at end of file From 03a4cf7f03928a7c65382e65d84d6da383e9663d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 12:41:05 +0100 Subject: [PATCH 119/204] Adding a n compression parameter for the latent space size --- scripts/shapes/distmatrix2embeding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index b42cd663..6c4ff928 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -496,6 +496,7 @@ def main_process(params): ] matrix_dim = 512 +n = 2 params = types.SimpleNamespace(**{ # general params "model": "resnet18_vae", @@ -503,7 +504,8 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - "latent_dim": int((matrix_dim**2 - matrix_dim) / 2), + #"latent_dim": int((matrix_dim**2 - matrix_dim) / 2), + "latent_dim": int((matrix_dim*(matrix_dim-1))/2**n), "num_embeddings": 1024, "num_hiddens": 1024, "pretrained": True, From 5467f9f3ecc1dc0ee325bdcb730ddac9d9634257 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 13:06:25 +0100 Subject: [PATCH 120/204] improve scoring function and use StratifiedKFold instead of KFold for cross validation --- scripts/shapes/distmatrix2embeding.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 6c4ff928..2746e793 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -87,10 +87,11 @@ def scoring_df(X, y): ) # Define a dictionary of metrics scoring = { - "accuracy": make_scorer(metrics.accuracy_score), + "accuracy": 
make_scorer(metrics.balanced_accuracy_score), "precision": make_scorer(metrics.precision_score, average="macro"), "recall": make_scorer(metrics.recall_score, average="macro"), "f1": make_scorer(metrics.f1_score, average="macro"), + #"roc_auc": make_scorer(metrics.roc_auc_score, average="macro") } # Create a random forest classifier @@ -111,7 +112,7 @@ def scoring_df(X, y): estimator=pipeline, X=X, y=y, - cv=KFold(n_splits=k_folds), + cv=StratifiedKFold(n_splits=k_folds), scoring=scoring, n_jobs=-1, return_train_score=False, From 344c8ab7fc1351d74f6167d9bd171eab75489845 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 14:03:10 +0100 Subject: [PATCH 121/204] hardcoded commited setup now points to quick test setup --- scripts/shapes/distmatrix2embeding.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 2746e793..f23b1d2a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -383,14 +383,14 @@ def main_process(params): df_shape_embed = df.drop('fname', axis=1).copy() df_shape_embed = df_shape_embed.set_index('class') - #regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ - regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + #regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ transforms.Grayscale(1) #, CropCentroidPipeline(128 * 2) ])) df_regionprops = create_regionprops_df(regionprop_dataset) - #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ - efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ transforms.Grayscale(1) #, CropCentroidPipeline(128 * 2) , ImageToCoords(128 * 2) @@ -496,7 +496,8 @@ def main_process(params): , "resnet50_vae_legacy" ] -matrix_dim = 512 +#matrix_dim = 512 +matrix_dim = 4 n = 2 params = types.SimpleNamespace(**{ # general params From 9a6456e288cf9d1bf7b12a61ff5480dab48abb52 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 19:51:52 +0100 Subject: [PATCH 122/204] initial refactor commit, script with split up functionnalities, missing metrics --- scripts/shapeembed/__init__.py | 1 + scripts/shapeembed/dataset_transformations.py | 148 ++++++++ scripts/shapeembed/shapeembed.py | 358 ++++++++++++++++++ 3 files changed, 507 insertions(+) create mode 100644 scripts/shapeembed/__init__.py create mode 100644 scripts/shapeembed/dataset_transformations.py create mode 100755 scripts/shapeembed/shapeembed.py diff --git a/scripts/shapeembed/__init__.py b/scripts/shapeembed/__init__.py new file mode 100644 index 00000000..e5853d2e --- /dev/null +++ b/scripts/shapeembed/__init__.py @@ -0,0 +1 @@ +from .dataset_transformations import 
mask2distmatrix diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py new file mode 100644 index 00000000..1cd76c7f --- /dev/null +++ b/scripts/shapeembed/dataset_transformations.py @@ -0,0 +1,148 @@ +import numpy as np +import imageio.v3 as iio +import skimage as sk +from scipy.interpolate import splprep, splev +import scipy.spatial +import argparse +import pathlib +import types +import glob +import os +import logging + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +# misc helpers +############################################################################### + +def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): + """Turn an rgb array into a greyscale array using the following reduction: + grey = cr * r + cg * g + cb * b + + :param rgb: The rgb array + :param cr: The red coefficient + :param cg: The green coefficient + :param cb: The blue coefficient + + :returns: The greyscale array. + """ + r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] + return cr * r + cg * g + cb * b + +# API functions +############################################################################### + +def find_longest_contour(mask, normalise_coord=False): + """Find all contours existing in 'mask' and return the longest one + + :param mask: The image with masked objects + :param normalise_coord(default: False): optionally normalise coordinates + + :returns: the longest contour as a pair of lists for the x and y + coordinates + """ + # force the image to grayscale + if len(mask.shape) == 3: # (lines, columns, number of channels) + mask = rgb2grey(mask) + # extract the contours from the now grayscale image + contours = sk.measure.find_contours(mask, 0.8) + logger.debug(f'find_longest_contour: len(contours) {len(contours)}') + # sort the contours by length + contours = sorted(contours, key=lambda x: len(x), reverse=True) + # isolate the longest contour (first in the sorted list) + x, y = contours[0][:, 0], contours[0][:, 1] + # optionally normalise the coordinates in the countour + if normalise_coord: + x = x - np.min(x) + x = x / np.max(x) + y = y - np.min(y) + y = y / np.max(y) + # return the contour as a pair of lists of x and y coordinates + return x, y + +def spline_interpolation(x, y, spline_sampling, raw_sampling_sparsity=1): + """Return a resampled spline interpolation of a provided contour + + :param x: The list of x coordinates of a contour + :param y: The list of y coordinates of a contour + :param spline_sampling: The number of points to sample on the spline + :param raw_sampling_sparsity (default=1): + The distance (in number of gaps) to the next point to consider in the + raw contour (i.e. whether consider every point, every other point + , every 3 points... 
This might be considered to avoid artifacts due to + high point count contours over low pixel resolution images, with contour + effectively curving around individual pixel edges) + + :returns: the resampled spline with spline_sampling points as a pair of + lists of x and y coordinates + """ + # Force sparsity to be at least one + raw_sampling_sparsity = max(1, raw_sampling_sparsity) + logger.debug(f'spline_interpolation: running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') + logger.debug(f'spline_interpolation: x.shape {x.shape} y.shape {y.shape}') + # prepare the spline interpolation of the given contour + tck, u = splprep( [x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]] + , s = 0 # XXX + , per = True # closed contour (periodic spline) + ) + # how many times to sample the spline + # last parameter is how dense is our spline, how many points. + new_u = np.linspace(u.min(), u.max(), spline_sampling) + # evaluate and return the sampled spline + x_spline, y_spline = splev(new_u, tck) + return x_spline, y_spline + +def build_distance_matrix(x_reinterpolated, y_reinterpolated): + """Turn a (reinterpolated) contour into a distance matrix + + :param x_reinterpolated: The list of x coordinates of a contour + :param y_reinterpolated: The list of y coordinates of a contour + + :returns: the distance matrix characteristic of the provided contour + """ + # reshape the pair of lists of individual x and y coordinates as a single + # numpy array of pairs of (x,y) coordinates + reinterpolated_contour = np.column_stack([ x_reinterpolated + , y_reinterpolated ]) + # build the distance matrix from the reshaped input data + dm = scipy.spatial.distance_matrix( reinterpolated_contour + , reinterpolated_contour ) + return dm + +def dist_to_coords(dst_mat): + """Turn a distance matrix into the corresponding contour + XXX + TODO sort out exactly the specifics here... + """ + embedding = MDS(n_components=2, dissimilarity='precomputed') + return embedding.fit_transform(dst_mat) + +def mask2distmatrix(mask, matrix_size=512, raw_sampling_sparsity=1): + """Get the distance matrix characteristic of the (biggest) object in the + provided image + + :param mask: The image with masked objects + :param matrix_size(default: 512): the desired matrix size + :param raw_sampling_sparsity (default=1): + The distance (in number of gaps) to the next point to consider in the + raw contour (i.e. whether consider every point, every other point + , every 3 points... 
This might be considered to avoid artifacts due to + high point count contours over low pixel resolution images, with contour + effectively curving around individual pixel edges) + + :returns: the distance matrix characteristic of the (biggest) object in + the provided image + """ + logger.debug(f'mask2distmatrix: running with raw_sampling_sparsity {raw_sampling_sparsity} and matrix_size {matrix_size}') + # extract mask contour + x, y = find_longest_contour(mask, normalise_coord=True) + logger.debug(f'mask2distmatrix: found contour shape x {x.shape} y {y.shape}') + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, matrix_size, raw_sampling_sparsity) + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + logger.debug(f'mask2distmatrix: created distance matrix shape {dm.shape}') + return dm diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py new file mode 100755 index 00000000..d4bf8a40 --- /dev/null +++ b/scripts/shapeembed/shapeembed.py @@ -0,0 +1,358 @@ +#! /usr/bin/env python3 + +# general utils +import os +import re +import copy +import types +import pickle +import base64 +import hashlib +import logging +import functools + +# machine learning utils +import torch +from torchvision import datasets, transforms +import pytorch_lightning as pl +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint + +# own source files +import bioimage_embed +import bioimage_embed.shapes +from dataset_transformations import * + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) + +# script inputs and parameters +############################################################################### + +# available types of datasets (raw, masks, distance matrix) +dataset_types = [ + "raw_image" +, "mask" +, "distance_matrix" +] + +# available models +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_beta_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] + +# set of parameters for a run, with default values +dflt_params = types.SimpleNamespace( + model_name='resnet18_vae' +, dataset=types.SimpleNamespace( + name='tiny_synthetic_shapes' + , path='/nfs/research/uhlmann/afoix/image_datasets/tiny_synthetic_shapes' + , type='mask' + ) +, batch_size=4 +, compression_factor=2 +, matrix_size=512 +, num_embeddings=1024 +, num_hiddens=1024 +, num_workers=16 +, epochs=150 +, pretrained=False +, frobenius_norm=False +, checkpoints_path='./checkpoints' +, commitment_cost=0.25 +, decay=0.99 +# optimizer_params +, opt="AdamW" +, lr=0.001 +, weight_decay=0.0001 +, momentum=0.9 +# lr_scheduler_params +, sched="cosine" +, min_lr=1e-4 +, warmup_epochs=5 +, warmup_lr=1e-6 +, cooldown_epochs=10 +, t_max=50 +, cycle_momentum=False +) + +# data +############################################################################### + +def maybe_roll(dist_mat, p = 0.5): + if np.random.rand() < p: + return np.roll(dist_mat, np.random.randint(0, dist_mat.shape[0]), (0,1)) + else: + return dist_mat + +def sanity_check(dist_mat): + if not np.allclose(dist_mat, dist_mat.T): + raise 
ValueError("Matrix is not symmetric") + if np.any(dist_mat < 0): + raise ValueError("Matrix has negative values") + if np.any(np.diag(dist_mat)): + raise ValueError("Matrix has non-zero diagonal") + return dist_mat + +def get_dataloader(params): + # transformations / checks to run on distance matrices + distmat_ts = transforms.Compose([ + lambda x: x / np.linalg.norm(x, "fro") # normalize the matrix + , lambda x: maybe_roll(x, p = 1.0) # "potentially" roll the matrix + , sanity_check # check if the matrix is symmetric and positive, and the diagonal is zero + , torch.as_tensor # turn (H,W) numpy array into a (H,W) tensor + , lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) + ]) + # dataset to load + logger.info(f'loading dataset {params.dataset.name}') + dataset = None + if params.dataset.type == 'raw_image': # TODO + raise NotImplementedError("raw images not yet supported") + elif params.dataset.type == 'mask': # mask data, convert to distance matrix first + dataset = datasets.ImageFolder( + params.dataset.path + , transforms.Compose([ np.array + , functools.partial( mask2distmatrix + , matrix_size=params.matrix_size ) + , distmat_ts ])) + elif params.dataset.type == 'distance_matrix': # distance matrix data + dataset = datasets.DatasetFolder( params.dataset.path + , loader=np.load + , extensions=('npy') + , transform = distmat_ts ) + assert dataset, f"could not load dataset {params.dataset.name}" + # create the dataloader from the dataset and other parameters + dataloader = bioimage_embed.lightning.DataModule( + dataset + , batch_size=params.batch_size + , shuffle=True + , num_workers=params.num_workers + ) + dataloader.setup() + logger.info(f'dataloader ready') + return dataloader + +# model +############################################################################### + +def get_model(params): + logger.info(f'setup model') + model = bioimage_embed.models.create_model( + model=params.model_name + , input_dim=params.input_dim + , latent_dim=params.latent_dim + , pretrained=params.pretrained + , **vars(params.model_args) + ) + lit_model = bioimage_embed.shapes.MaskEmbed(model, params) + logger.info(f'model ready') + return lit_model + +# trainer +############################################################################### + +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def get_trainer(model, params): + + # setup WandB logger + logger.info('setup wandb logger') + jobname = f"{params.model_name}_{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}" + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger.watch(model, log="all") + + # setup checkpoints + logger.info('setup checkpoints') + model_dir = f"{params.checkpoints_path}/{hashing_fn(params)}" + os.makedirs(f"{model_dir}/", exist_ok=True) + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/" + , save_last=True + , save_top_k=1 + , monitor="loss/val" + , mode="min" + ) + + # setup trainer + logger.info('setup trainer') + trainer = pl.Trainer( + logger=[wandblogger] + , gradient_clip_val=0.5 + , enable_checkpointing=True + , devices=1 + , accelerator="gpu" + , accumulate_grad_batches=4 + , callbacks=[ checkpoint_callback + , 
EarlyStopping(monitor="loss/val", mode="min") + ] + , min_epochs=50 + , max_epochs=params.epochs + , log_every_n_steps=1 + ) + + logger.info(f'trainer ready') + return trainer + +# train / validate / test the model +############################################################################### + +def train_model(trainer, model, dataloader): + # retrieve the checkpoint information from the trainer and check if a + # checkpoint exists to resume from + checkpoint_callback = trainer.checkpoint_callback + last_checkpoint_path = checkpoint_callback.last_model_path + best_checkpoint_path = checkpoint_callback.best_model_path + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + # train the model + logger.info('training the model') + trainer.fit(model, datamodule=dataloader, ckpt_path=resume_checkpoint) + model.eval() + + return model + +def validate_model(trainer, model, dataloader): + logger.info('validating the model') + validation = trainer.validate(model, datamodule=dataloader) + return validation + +def test_model(trainer, model, dataloader): + logger.info('testing the model') + testing = trainer.test(model, datamodule=dataloader) + return testing + +# main process +############################################################################### + +def main_process(params): + + # setup + model = get_model(params) + trainer = get_trainer(model, params) + dataloader = get_dataloader(params) + + # run actual work + train_model(trainer, model, dataloader) + validate_model(trainer, model, dataloader) + test_model(trainer, model, dataloader) + + # gather results + +# main entry point +############################################################################### +if __name__ == '__main__': + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + parser.add_argument( + '-m', '--model', choices=models, metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {dflt_params.model_name}).") + parser.add_argument( + '--model-arg-beta', type=float, metavar='BETA' + , help=f"The BETA parameter to use for a beta-vae model.") + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE') + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_params.dataset})") + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=None + , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument( + '--wandb-entity', default="foix", metavar='WANDB_ENTITY' + , help=f"The WANDB_ENTITY name") + parser.add_argument( + '--wandb-project', default="simply-shape", metavar='WANDB_PROJECT' + , help=f"The WANDB_PROJECT name") + parser.add_argument( + '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int + , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '-c', '--compression-factor', metavar='COMPRESSION_FACTOR', type=auto_pos_int + , help=f"The COMPRESSION_FACTOR, a positive integer (default {dflt_params.compression_factor})") + parser.add_argument( + '--distance-matrix-size', metavar='MATRIX_SIZE', type=auto_pos_int + , help=f"The size of the distance matrix (default {dflt_params.matrix_size})") + parser.add_argument( + '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int + , help=f"The NUM_EMBEDDINGS, a positive integer (default {dflt_params.num_embeddings})") + parser.add_argument( + '--number-hiddens', metavar='NUM_HIDDENS', type=auto_pos_int + , help=f"The NUM_HIDDENS, a positive integer (default {dflt_params.num_hiddens})") + parser.add_argument( + '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int + , help=f"The NUM_WORKERS for the run, a positive integer (default {dflt_params.num_workers})") + parser.add_argument( + '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int + , help=f"The NUM_EPOCHS for the run, a positive integer (default {dflt_params.epochs})") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + if clargs.verbose > 0: + logging.basicConfig(level=logging.INFO) + + params = copy.deepcopy(dflt_params) + # update default params with clargs + if clargs.model: + params.model = clargs.model + params.model_args = types.SimpleNamespace() + if clargs.model_arg_beta: + params.model_args.beta = clargs.model_arg_beta + params.output_dir = clargs.output_dir + if clargs.dataset: + params.dataset = clargs.dataset + if clargs.wandb_entity: + params.wandb_entity = clargs.wandb_entity + if clargs.wandb_project: + params.wandb_project = clargs.wandb_project + if clargs.batch_size: + params.batch_size = clargs.batch_size + if clargs.distance_matrix_size: + params.matrix_size = clargs.distance_matrix_size + params.input_dim = (3, params.matrix_size, params.matrix_size) + if clargs.compression_factor: + params.compression_factor = clargs.compression_factor + n_features = lambda d, n: d*(d-1)/(2**n) + params.latent_dim = n_features(params.matrix_size, params.compression_factor) + if 
clargs.number_embeddings: + params.num_embeddings = clargs.number_embeddings + if clargs.number_hiddens: + params.num_hiddens = clargs.number_hiddens + if clargs.num_workers: + params.num_workers = clargs.num_workers + if clargs.num_epochs: + params.epochs = clargs.num_epochs + + main_process(params) From 751394e44251a1603268a1e95e25a59024312d40 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 17 Jun 2024 01:29:02 +0100 Subject: [PATCH 123/204] Added predictions + kmeans of input data --- scripts/shapeembed/shapeembed.py | 111 +++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 12 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index d4bf8a40..048f0bcf 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -1,5 +1,15 @@ #! /usr/bin/env python3 +# machine learning utils +import torch +from torchvision import datasets, transforms +import pytorch_lightning as pl +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +from sklearn.cluster import KMeans +from sklearn.metrics import confusion_matrix, accuracy_score + # general utils import os import re @@ -7,18 +17,12 @@ import types import pickle import base64 +import pandas import hashlib import logging +import datetime import functools -# machine learning utils -import torch -from torchvision import datasets, transforms -import pytorch_lightning as pl -from pytorch_lightning import loggers as pl_loggers -from pytorch_lightning.callbacks.early_stopping import EarlyStopping -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint - # own source files import bioimage_embed import bioimage_embed.shapes @@ -69,7 +73,7 @@ , matrix_size=512 , num_embeddings=1024 , num_hiddens=1024 -, num_workers=16 +, num_workers=8 , epochs=150 , pretrained=False , frobenius_norm=False @@ -244,22 +248,98 @@ def test_model(trainer, model, dataloader): testing = trainer.test(model, datamodule=dataloader) return testing +def run_predictions(trainer, model, dataloader, num_workers=8): + + # prepare new unshuffled datamodule + datamod = bioimage_embed.lightning.DataModule( + dataloader.dataset + , batch_size=1 + , shuffle=False + , num_workers=num_workers + ) + datamod.setup() + + # run predictions + predictions = trainer.predict(model, datamodule=datamod) + + # extract latent space + latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + + # extract class indices and filenames and provide a richer pandas dataframe + ds = datamod.get_dataset() + class_indices = np.array([ int(lbl) + for _, lbl in datamod.predict_dataloader() ]) + fnames = [fname for fname, _ in ds.samples] + df = pandas.DataFrame(latent_space) + df['class_idx'] = class_indices + #df['class'] = [ds.classes[x] for x in class_indices] + df['class'] = pandas.Series([ ds.classes[x] + for x in class_indices]).astype("category") + df['fname'] = fnames + #df['scale'] = scalings[:,0].squeeze() + + return (predictions, latent_space, df) + +def dataloader_to_dataframe(dataloader): + # gather the data and the associated labels, and drop rows with NaNs + all_data = [] + all_lbls = [] + for batch in dataloader: + inputs, lbls = batch + for data, lbl in zip(inputs, lbls): + all_data.append(data.flatten().numpy()) + all_lbls.append(int(lbl)) + df = pandas.DataFrame(all_data) + df['label'] = all_lbls + df.dropna() + return df + +def 
run_kmeans(dataframe, random_seed=42): + # run KMeans and derive accuracy metric and confusion matrix + kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + , random_state=random_seed + ).fit(dataframe.drop('label', axis=1)) + accuracy = accuracy_score(dataframe['label'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) + + return kmeans, accuracy, conf_mat + # main process ############################################################################### def main_process(params): # setup + ####### model = get_model(params) trainer = get_trainer(model, params) dataloader = get_dataloader(params) # run actual work + ################# train_model(trainer, model, dataloader) validate_model(trainer, model, dataloader) test_model(trainer, model, dataloader) - # gather results + # run predictions + ################# + # ... and gather latent space + predictions, latent_space, df = run_predictions( + trainer, model, dataloader + , num_workers=params.num_workers + ) + # ... and prepare output directory and save latent space + os.makedirs(f"{params.output_dir}/", exist_ok=True) + np.save(f'{params.output_dir}/latent_space.npy', latent_space) + df.to_pickle(f'{params.output_dir}/latent_space.pkl') + + # gather metrics + ################ + # kmeans on input data + _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + logger.info(f'-- kmeans on input data --') + logger.info(f'-- accuracy: {accuracy}') + logger.info(f'-- confusion matrix:\n{conf_mat}') # main entry point ############################################################################### @@ -321,10 +401,10 @@ def auto_pos_int (x): # set verbosity level if clargs.verbose > 0: - logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.DEBUG) - params = copy.deepcopy(dflt_params) # update default params with clargs + params = copy.deepcopy(dflt_params) if clargs.model: params.model = clargs.model params.model_args = types.SimpleNamespace() @@ -354,5 +434,12 @@ def auto_pos_int (x): params.num_workers = clargs.num_workers if clargs.num_epochs: params.epochs = clargs.num_epochs + if clargs.output_dir: + params.output_dir = clargs.output_dir + else: + params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + # XXX + torch.set_float32_matmul_precision('medium') + # XXX main_process(params) From 949254d1bd13da5ec3675d3eaaa1cc58fd483fce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 17 Jun 2024 22:24:49 +0100 Subject: [PATCH 124/204] factored out evaluation functionality + added regionprops, efd and scoring --- scripts/shapeembed/__init__.py | 1 + scripts/shapeembed/evaluation.py | 130 +++++++++++++++++++++++++++++++ scripts/shapeembed/shapeembed.py | 47 +++++------ 3 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 scripts/shapeembed/evaluation.py diff --git a/scripts/shapeembed/__init__.py b/scripts/shapeembed/__init__.py index e5853d2e..cd331ee4 100644 --- a/scripts/shapeembed/__init__.py +++ b/scripts/shapeembed/__init__.py @@ -1 +1,2 @@ from .dataset_transformations import mask2distmatrix +from .evaluation import * diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py new file mode 100644 index 00000000..f6575b96 --- /dev/null +++ b/scripts/shapeembed/evaluation.py @@ -0,0 +1,130 @@ +from torchvision import datasets, transforms +import pyefd +from skimage import measure +from sklearn.cluster 
import KMeans +from sklearn.pipeline import Pipeline +from sklearn.ensemble import RandomForestClassifier +from sklearn.discriminant_analysis import StandardScaler +from sklearn import metrics +from sklearn.metrics import make_scorer +from sklearn.metrics import confusion_matrix, accuracy_score +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold + +import tqdm +import numpy +import pandas +import logging + +from bioimage_embed.shapes.transforms import ImageToCoords + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def dataloader_to_dataframe(dataloader): + # gather the data and the associated labels, and drop rows with NaNs + all_data = [] + all_lbls = [] + for batch in dataloader: + inputs, lbls = batch + for data, lbl in zip(inputs, lbls): + all_data.append(data.flatten().numpy()) + all_lbls.append(int(lbl)) + df = pandas.DataFrame(all_data) + df['label'] = all_lbls + df.dropna() + return df + +def run_kmeans(dataframe, random_seed=42): + # run KMeans and derive accuracy metric and confusion matrix + kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + , random_state=random_seed + ).fit(dataframe.drop('label', axis=1)) + accuracy = accuracy_score(dataframe['label'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) + + return kmeans, accuracy, conf_mat + +def run_regionprops( dataset_params + , properties = [ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] ): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + # ... and run regionprops for the given properties for each image + dfs = [] + logger.info(f'running regionprops on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + t = measure.regionprops_table(numpy.array(img), properties=properties) + df = pandas.DataFrame(t) + df['class'] = lbl + df.set_index("class", inplace=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs) + return df + +def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + # ... 
and run efd on each image + dfs = [] + logger.info(f'running efd on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pandas.DataFrame({ + "norm_coeffs": norm_coeffs.flatten().tolist() + , "coeffs": coeffs.flatten().tolist() + }).T.rename_axis("coeffs") + df['class'] = lbl + df.set_index("class", inplace=True, append=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + return pandas.concat(dfs).xs('coeffs', level='coeffs') + +def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): + # TODO, currently unused + # Split the data into training and test sets + #X_train, X_test, y_train, y_test = train_test_split( + # df, df.index, stratify=df.index + #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle + #) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.balanced_accuracy_score) + , "precision": make_scorer(metrics.precision_score, average="macro") + , "recall": make_scorer(metrics.recall_score, average="macro") + , "f1": make_scorer(metrics.f1_score, average="macro") + #, "roc_auc": make_scorer(metrics.roc_auc_score, average="macro") + } + # Create a random forest classifier + pipeline = Pipeline([ + ("scaler", StandardScaler()) + #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed)) + , ("clf", RandomForestClassifier()) + #, ("clf", DummyClassifier()) + ]) + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=pipeline + , X=df + , y=df.index + , cv=StratifiedKFold(n_splits=k_folds) + , scoring=scoring + , n_jobs=-1 + , return_train_score=False + ) + # Put the results into a DataFrame + return pandas.DataFrame(cv_results) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 048f0bcf..420af2ee 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -7,8 +7,6 @@ from pytorch_lightning import loggers as pl_loggers from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint -from sklearn.cluster import KMeans -from sklearn.metrics import confusion_matrix, accuracy_score # general utils import os @@ -27,6 +25,7 @@ import bioimage_embed import bioimage_embed.shapes from dataset_transformations import * +from evaluation import * # logging facilities ############################################################################### @@ -280,30 +279,6 @@ def run_predictions(trainer, model, dataloader, num_workers=8): return (predictions, latent_space, df) -def dataloader_to_dataframe(dataloader): - # gather the data and the associated labels, and drop rows with NaNs - all_data = [] - all_lbls = [] - for batch in dataloader: - inputs, lbls = batch - for data, lbl in zip(inputs, lbls): - all_data.append(data.flatten().numpy()) - all_lbls.append(int(lbl)) - df = pandas.DataFrame(all_data) - df['label'] = all_lbls - df.dropna() - return df - -def run_kmeans(dataframe, random_seed=42): - # run KMeans and derive accuracy metric and confusion matrix - kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) - , random_state=random_seed - ).fit(dataframe.drop('label', axis=1)) - accuracy = accuracy_score(dataframe['label'], kmeans.labels_) - conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) - - return kmeans, accuracy, 
conf_mat - # main process ############################################################################### @@ -335,11 +310,25 @@ def main_process(params): # gather metrics ################ + # regionprops on input data + logger.info(f'-- regionprops on input data --') + regionprops_df = run_regionprops(params.dataset) + logger.debug(regionprops_df) + regionprops_score_df = score_dataframe(regionprops_df) + logger.info(f'-- regionprops on input data, score:') + logger.info(regionprops_score_df) + # elliptic fourier descriptors on input data + logger.info(f'-- elliptic fourier descriptors on input data --') + efd_df = run_elliptic_fourier_descriptors(params.dataset) + logger.debug(efd_df) + efd_score_df = score_dataframe(efd_df) + logger.info(f'-- elliptic fourier descriptors on input data, score:') + logger.info(efd_score_df) # kmeans on input data - _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) logger.info(f'-- kmeans on input data --') - logger.info(f'-- accuracy: {accuracy}') - logger.info(f'-- confusion matrix:\n{conf_mat}') + _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # main entry point ############################################################################### From 43a396978ea15d2aa626086d6fbe020a988970d4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 19:24:08 +0100 Subject: [PATCH 125/204] cleaner logging + score shapeembed itself --- scripts/shapeembed/evaluation.py | 9 +++++--- scripts/shapeembed/shapeembed.py | 39 ++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index f6575b96..e00b3418 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -95,10 +95,13 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): return pandas.concat(dfs).xs('coeffs', level='coeffs') def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): + # drop strings and python object columns + #clean_df = df.select_dtypes(exclude=['object']) + clean_df = df.select_dtypes(include=['number']) # TODO, currently unused # Split the data into training and test sets #X_train, X_test, y_train, y_test = train_test_split( - # df, df.index, stratify=df.index + # clean_df, clean_df.index, stratify=clean_df.index #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle #) # Define a dictionary of metrics @@ -119,8 +122,8 @@ def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline - , X=df - , y=df.index + , X=clean_df + , y=clean_df.index , cv=StratifiedKFold(n_splits=k_folds) , scoring=scoring , n_jobs=-1 diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 420af2ee..25341893 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -299,36 +299,39 @@ def main_process(params): # run predictions ################# # ... and gather latent space - predictions, latent_space, df = run_predictions( + predictions, latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) # ... 
and prepare output directory and save latent space os.makedirs(f"{params.output_dir}/", exist_ok=True) np.save(f'{params.output_dir}/latent_space.npy', latent_space) - df.to_pickle(f'{params.output_dir}/latent_space.pkl') + shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') # gather metrics ################ - # regionprops on input data + # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) - logger.debug(regionprops_df) + logger.debug(f'\n{regionprops_df}') regionprops_score_df = score_dataframe(regionprops_df) - logger.info(f'-- regionprops on input data, score:') - logger.info(regionprops_score_df) - # elliptic fourier descriptors on input data + logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') + # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) - logger.debug(efd_df) + logger.debug(f'\n{efd_df}') efd_score_df = score_dataframe(efd_df) - logger.info(f'-- elliptic fourier descriptors on input data, score:') - logger.info(efd_score_df) - # kmeans on input data + logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') + # kmeans on input data and score logger.info(f'-- kmeans on input data --') _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) logger.info(f'-- kmeans accuracy: {accuracy}') logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + # score shape embed + logger.info(f'-- score shape embed --') + logger.debug(f'\n{shapeembed_df}') + shapeembed_score_df = score_dataframe(shapeembed_df) + logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') # main entry point ############################################################################### @@ -384,14 +387,16 @@ def auto_pos_int (x): , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") - + # parse command line arguments clargs=parser.parse_args() - + # set verbosity level - if clargs.verbose > 0: - logging.basicConfig(level=logging.DEBUG) - + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + # update default params with clargs params = copy.deepcopy(dflt_params) if clargs.model: @@ -427,7 +432,7 @@ def auto_pos_int (x): params.output_dir = clargs.output_dir else: params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' - + # XXX torch.set_float32_matmul_precision('medium') # XXX From 9030e487e933cd87f31fa2a614b3fab563647635 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 19:34:09 +0100 Subject: [PATCH 126/204] reshaped shapeembed reported dataframe --- scripts/shapeembed/shapeembed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 25341893..4fe4338f 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -270,12 +270,12 @@ def run_predictions(trainer, model, dataloader, num_workers=8): for _, lbl in datamod.predict_dataloader() ]) fnames = [fname for fname, _ in ds.samples] df = pandas.DataFrame(latent_space) - df['class_idx'] = class_indices - #df['class'] = [ds.classes[x] for 
x in class_indices] - df['class'] = pandas.Series([ ds.classes[x] - for x in class_indices]).astype("category") - df['fname'] = fnames - #df['scale'] = scalings[:,0].squeeze() + df.insert(loc=0, column='fname', value=fnames) + #df.insert(loc=0, column='scale', value=scalings[:,0].squeeze()) + df.insert( loc=0, column='class_name' + , value=[ds.classes[x] for x in class_indices]) + df.insert(loc=0, column='class', value=class_indices) + df.set_index("class", inplace=True) return (predictions, latent_space, df) From 6a5c7d028b1469de8d876f2e4bc6f856a43d9b17 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 21:09:56 +0100 Subject: [PATCH 127/204] renamed label to class --- scripts/shapeembed/evaluation.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index e00b3418..641ff27c 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -32,18 +32,17 @@ def dataloader_to_dataframe(dataloader): all_data.append(data.flatten().numpy()) all_lbls.append(int(lbl)) df = pandas.DataFrame(all_data) - df['label'] = all_lbls + df['class'] = all_lbls df.dropna() return df def run_kmeans(dataframe, random_seed=42): # run KMeans and derive accuracy metric and confusion matrix - kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + kmeans = KMeans( n_clusters=len(dataframe['class'].unique()) , random_state=random_seed - ).fit(dataframe.drop('label', axis=1)) - accuracy = accuracy_score(dataframe['label'], kmeans.labels_) - conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) - + ).fit(dataframe.drop('class', axis=1)) + accuracy = accuracy_score(dataframe['class'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['class'], kmeans.labels_) return kmeans, accuracy, conf_mat def run_regionprops( dataset_params From d216cc7df541597c92854c70d2630174509b3543 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 21:11:01 +0100 Subject: [PATCH 128/204] updated scoring function + collate and save results --- scripts/shapeembed/evaluation.py | 38 ++++++++++++++++++++++++++++++-- scripts/shapeembed/shapeembed.py | 24 ++++++++++++-------- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 641ff27c..e8692255 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -14,6 +14,8 @@ import numpy import pandas import logging +import seaborn +import matplotlib.pyplot as plt from bioimage_embed.shapes.transforms import ImageToCoords @@ -93,7 +95,8 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): # concatenate results as a single dataframe and return it return pandas.concat(dfs).xs('coeffs', level='coeffs') -def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): +def score_dataframe( df, name + , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns #clean_df = df.select_dtypes(exclude=['object']) clean_df = df.select_dtypes(include=['number']) @@ -129,4 +132,35 @@ def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): , return_train_score=False ) # Put the results into a DataFrame - return pandas.DataFrame(cv_results) + df = pandas.DataFrame(cv_results) + df = df.drop(["fit_time", "score_time"], axis=1) + df.insert(loc=0, column='trial', value=name) + return df + +def save_scores( scores_df + , outputdir='.' 
+ , width = 3.45 + , height = 3.45 / 1.618 ): + # save all raw scores as csv + scores_df.to_csv(f"{outputdir}/scores_df.csv") + # save score means as csv + scores_df.groupby("trial").mean().to_csv(f"{outputdir}/scores_df_mean.csv") + # save a barplot representation of scores + melted_df = scores_df.melt( id_vars="trial" + , var_name="Metric" + , value_name="Score" ) + seaborn.catplot( data=melted_df + , kind="bar" + , x="trial" + , hue="Metric" + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + plt.savefig(f"{outputdir}/scores_barplot.pdf") + plt.close() + # log info + logger.info(melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean()) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 4fe4338f..aa491b58 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -94,7 +94,7 @@ , cycle_momentum=False ) -# data +# dataset loading functions ############################################################################### def maybe_roll(dist_mat, p = 0.5): @@ -310,28 +310,34 @@ def main_process(params): # gather metrics ################ + # kmeans on input data and score + logger.info(f'-- kmeans on input data --') + kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + print(kmeans) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) logger.debug(f'\n{regionprops_df}') - regionprops_score_df = score_dataframe(regionprops_df) + regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) logger.debug(f'\n{efd_df}') - efd_score_df = score_dataframe(efd_df) + efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # score shape embed logger.info(f'-- score shape embed --') logger.debug(f'\n{shapeembed_df}') - shapeembed_score_df = score_dataframe(shapeembed_df) + shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + # collate and save gathered results TODO KMeans + scores_df = pandas.concat([ regionprops_score_df + , efd_score_df + , shapeembed_score_df ]) + save_scores(scores_df, outputdir=params.output_dir) # main entry point ############################################################################### From 7e2bdbd5edf16432e791c56cf4f1419bbcf85ffd Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 22:07:40 +0100 Subject: [PATCH 129/204] Added clargs to control matrix normalization and roll --- scripts/shapeembed/shapeembed.py | 57 ++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py 
b/scripts/shapeembed/shapeembed.py index aa491b58..fbd5570e 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -69,13 +69,15 @@ ) , batch_size=4 , compression_factor=2 -, matrix_size=512 +, distance_matrix_size=512 , num_embeddings=1024 , num_hiddens=1024 , num_workers=8 , epochs=150 , pretrained=False , frobenius_norm=False +, distance_matrix_normalize=True +, distance_matrix_roll_probability=1.0 , checkpoints_path='./checkpoints' , commitment_cost=0.25 , decay=0.99 @@ -113,14 +115,23 @@ def sanity_check(dist_mat): return dist_mat def get_dataloader(params): + # transformations / checks to run on distance matrices - distmat_ts = transforms.Compose([ - lambda x: x / np.linalg.norm(x, "fro") # normalize the matrix - , lambda x: maybe_roll(x, p = 1.0) # "potentially" roll the matrix - , sanity_check # check if the matrix is symmetric and positive, and the diagonal is zero - , torch.as_tensor # turn (H,W) numpy array into a (H,W) tensor - , lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) - ]) + ts = [] + if params.distance_matrix_normalize: # optionally normalize the matrix + ts.append(lambda x: x / np.linalg.norm(x, "fro")) + if params.distance_matrix_roll_probability > 0.0: # optionally try to roll the matrix + ts.append(lambda x: maybe_roll(x, p=params.distance_matrix_roll_probability)) + # always check if the matrix is symmetric, positive, and diagonal is zero + ts.append(sanity_check) + # turn (H,W) numpy array into a (H,W) tensor + ts.append(torch.as_tensor) + # turn (H,W) tensor into a (3,H,W) tensor (downstream model expectations) + ts.append(lambda x: x.repeat(3, 1, 1)) + # compose the all the distance matrix transformations + logger.debug(f'transformations to run: {len(ts)}') + distmat_ts = transforms.Compose(ts) + # dataset to load logger.info(f'loading dataset {params.dataset.name}') dataset = None @@ -131,7 +142,7 @@ def get_dataloader(params): params.dataset.path , transforms.Compose([ np.array , functools.partial( mask2distmatrix - , matrix_size=params.matrix_size ) + , matrix_size=params.distance_matrix_size ) , distmat_ts ])) elif params.dataset.type == 'distance_matrix': # distance matrix data dataset = datasets.DatasetFolder( params.dataset.path @@ -342,10 +353,17 @@ def main_process(params): # main entry point ############################################################################### if __name__ == '__main__': + def auto_pos_int (x): val = int(x,0) if val <= 0: - raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + raise argparse.ArgumentTypeError(f"argument must be a positive int. Got {val:d}.") + return val + + def prob (x): + val = float(x) + if val < 0.0 or val > 1.0: + raise argparse.ArgumentTypeError(f"argument must be between 0.0 and 1.0. 
Got {val:f}.") return val parser = argparse.ArgumentParser(description='Run the shape embed pipeline') @@ -371,12 +389,18 @@ def auto_pos_int (x): parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '--distance-matrix-normalize', action=argparse.BooleanOptionalAction, default=None + , help=f'Whether to normalize the distance matrices or not') + parser.add_argument( + '--distance-matrix-roll-probability', metavar='ROLL_PROB', type=prob, default=None + , help=f'Probability to roll the distance matrices along the diagonal (default {dflt_params.distance_matrix_roll_probability})') parser.add_argument( '-c', '--compression-factor', metavar='COMPRESSION_FACTOR', type=auto_pos_int , help=f"The COMPRESSION_FACTOR, a positive integer (default {dflt_params.compression_factor})") parser.add_argument( '--distance-matrix-size', metavar='MATRIX_SIZE', type=auto_pos_int - , help=f"The size of the distance matrix (default {dflt_params.matrix_size})") + , help=f"The size of the distance matrix (default {dflt_params.distance_matrix_size})") parser.add_argument( '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int , help=f"The NUM_EMBEDDINGS, a positive integer (default {dflt_params.num_embeddings})") @@ -420,12 +444,16 @@ def auto_pos_int (x): if clargs.batch_size: params.batch_size = clargs.batch_size if clargs.distance_matrix_size: - params.matrix_size = clargs.distance_matrix_size - params.input_dim = (3, params.matrix_size, params.matrix_size) + params.distance_matrix_size = clargs.distance_matrix_size + params.input_dim = (3, params.distance_matrix_size, params.distance_matrix_size) + if clargs.distance_matrix_normalize is not None: + params.distance_matrix_normalize = clargs.distance_matrix_normalize + if clargs.distance_matrix_roll_probability is not None: + params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor n_features = lambda d, n: d*(d-1)/(2**n) - params.latent_dim = n_features(params.matrix_size, params.compression_factor) + params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings if clargs.number_hiddens: @@ -442,4 +470,5 @@ def auto_pos_int (x): # XXX torch.set_float32_matmul_precision('medium') # XXX + logger.debug(f'run parameters:\n{params}') main_process(params) From 86ede7b00f76d84431a908f1881996a09831c893 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 23:23:27 +0100 Subject: [PATCH 130/204] Added umap_plot --- scripts/shapeembed/evaluation.py | 49 +++++++++++++++++++++++++++++++- scripts/shapeembed/shapeembed.py | 25 +++++++++------- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index e8692255..9655fbd7 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -1,5 +1,6 @@ from torchvision import datasets, transforms import pyefd +from umap import UMAP from skimage import measure from sklearn.cluster import KMeans from sklearn.pipeline import Pipeline @@ -22,7 +23,7 @@ # logging facilities ############################################################################### logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) 
+logging.basicConfig(level=logging.DEBUG)
 def dataloader_to_dataframe(dataloader):
   # gather the data and the associated labels, and drop rows with NaNs
@@ -137,6 +138,52 @@ def score_dataframe( df, name
   df.insert(loc=0, column='trial', value=name)
   return df
+def umap_plot( df
+             , name
+             , outputdir='.'
+             , n_neighbors=15
+             , min_dist=0.1
+             , n_components=2
+             , rand_seed=42
+             , split=0.7
+             , width=3.45
+             , height=3.45 / 1.618 ):
+  clean_df = df.select_dtypes(include=['number'])
+  umap_reducer = UMAP( n_neighbors=n_neighbors
+                     , min_dist=min_dist
+                     , n_components=n_components
+                     , random_state=rand_seed )
+  mask = numpy.random.rand(len(clean_df)) < split
+
+  clean_df.reset_index(level='class', inplace=True)
+  classes = clean_df['class'].copy()
+  semi_labels = classes.copy()
+  semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision
+  clean_df.drop('class', axis=1, inplace=True)
+
+  umap_embedding = umap_reducer.fit_transform(clean_df, y=semi_labels)
+  umap_data=pandas.DataFrame(umap_embedding, columns=["umap0", "umap1"])
+  umap_data['class'] = classes
+
+  ax = seaborn.relplot( data=umap_data
+                      , x="umap0"
+                      , y="umap1"
+                      , hue="class"
+                      , palette="deep"
+                      , alpha=0.5
+                      , edgecolor=None
+                      , s=5
+                      , height=height
+                      , aspect=0.5 * width / height )
+
+  seaborn.move_legend(ax, "upper center")
+  ax.set(xlabel=None, ylabel=None)
+  seaborn.despine(left=True, bottom=True)
+  plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
+  plt.tight_layout()
+  plt.savefig(f"{outputdir}/umap_{name}.pdf")
+  plt.close()
+
 def save_scores( scores_df
                , outputdir='.'
                , width = 3.45
diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py
index fbd5570e..384871f9 100755
--- a/scripts/shapeembed/shapeembed.py
+++ b/scripts/shapeembed/shapeembed.py
@@ -310,23 +310,25 @@ def main_process(params):
   # run predictions
   #################
   # ... and gather latent space
+  logger.info(f'-- run predictions and extract latent space --')
   predictions, latent_space, shapeembed_df = run_predictions(
     trainer, model, dataloader
   , num_workers=params.num_workers
   )
+  logger.debug(f'\n{shapeembed_df}')
   # ...
and prepare output directory and save latent space os.makedirs(f"{params.output_dir}/", exist_ok=True) np.save(f'{params.output_dir}/latent_space.npy', latent_space) shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') + logger.info(f'-- generate shapeembed umap --') + umap_plot(shapeembed_df, 'shapeembed', outputdir=params.output_dir) # gather metrics ################ - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - print(kmeans) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + # score shape embed + logger.info(f'-- score shape embed --') + shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) @@ -339,11 +341,12 @@ def main_process(params): logger.debug(f'\n{efd_df}') efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') - # score shape embed - logger.info(f'-- score shape embed --') - logger.debug(f'\n{shapeembed_df}') - shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + # kmeans on input data and score + logger.info(f'-- kmeans on input data --') + kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + print(kmeans) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , efd_score_df From 19edf47216914ad70b9a1ea5140c5289e215cec1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 14:47:02 +0100 Subject: [PATCH 131/204] fix dataset clarg --- scripts/shapeembed/shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 384871f9..86b79fdd 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -439,7 +439,10 @@ def prob (x): params.model_args.beta = clargs.model_arg_beta params.output_dir = clargs.output_dir if clargs.dataset: - params.dataset = clargs.dataset + params.dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + if clargs.wandb_entity: params.wandb_entity = clargs.wandb_entity if clargs.wandb_project: From 976edc23cab552611eafb7c0ccd03c11898bb306 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 14:54:14 +0100 Subject: [PATCH 132/204] fix model name clarg --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 86b79fdd..f2d59573 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -432,8 +432,8 @@ def prob (x): # update default params with clargs params = copy.deepcopy(dflt_params) - if clargs.model: - params.model = clargs.model + if clargs.model_name: + params.model_name = clargs.model_name params.model_args = types.SimpleNamespace() if clargs.model_arg_beta: params.model_args.beta = clargs.model_arg_beta From 
d1c5d3c713bdbf1ccb497111c1858fa77a0f8498 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 15:01:47 +0100 Subject: [PATCH 133/204] fix model_name clarg again --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index f2d59573..7d3f46e5 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -432,8 +432,8 @@ def prob (x): # update default params with clargs params = copy.deepcopy(dflt_params) - if clargs.model_name: - params.model_name = clargs.model_name + if clargs.model: + params.model_name = clargs.model params.model_args = types.SimpleNamespace() if clargs.model_arg_beta: params.model_args.beta = clargs.model_arg_beta From 485124112fb333f05172e0aac584c8982eaf1728 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 25 Jun 2024 07:29:11 +0100 Subject: [PATCH 134/204] Added early stop clarg (default no early stop) --- scripts/shapeembed/shapeembed.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 7d3f46e5..aa0b75b4 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -76,6 +76,7 @@ , epochs=150 , pretrained=False , frobenius_norm=False +, early_stop=False , distance_matrix_normalize=True , distance_matrix_roll_probability=1.0 , checkpoints_path='./checkpoints' @@ -208,6 +209,9 @@ def get_trainer(model, params): # setup trainer logger.info('setup trainer') + trainer_callbacks = [checkpoint_callback] + if params.early_stop: + trainer_callbacks.append(EarlyStopping(monitor="loss/val", mode="min")) trainer = pl.Trainer( logger=[wandblogger] , gradient_clip_val=0.5 @@ -215,9 +219,7 @@ def get_trainer(model, params): , devices=1 , accelerator="gpu" , accumulate_grad_batches=4 - , callbacks=[ checkpoint_callback - , EarlyStopping(monitor="loss/val", mode="min") - ] + , callbacks=trainer_callbacks , min_epochs=50 , max_epochs=params.epochs , log_every_n_steps=1 @@ -392,6 +394,9 @@ def prob (x): parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '--early-stop', action=argparse.BooleanOptionalAction, default=None + , help=f'Whether to stop training early or not (when loss "stops" decreasing. 
Beware of second decay...)') parser.add_argument( '--distance-matrix-normalize', action=argparse.BooleanOptionalAction, default=None , help=f'Whether to normalize the distance matrices or not') @@ -452,6 +457,8 @@ def prob (x): if clargs.distance_matrix_size: params.distance_matrix_size = clargs.distance_matrix_size params.input_dim = (3, params.distance_matrix_size, params.distance_matrix_size) + if clargs.early_stop is not None: + params.early_stop = clargs.early_stop if clargs.distance_matrix_normalize is not None: params.distance_matrix_normalize = clargs.distance_matrix_normalize if clargs.distance_matrix_roll_probability is not None: From b95272e958ac627c7c0bf2bfdf34a8e837cac3c9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 20:04:15 +0100 Subject: [PATCH 135/204] added confusion matrices to scoring function --- scripts/shapeembed/evaluation.py | 7 +++++-- scripts/shapeembed/shapeembed.py | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 9655fbd7..84d8c8b3 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -9,7 +9,7 @@ from sklearn import metrics from sklearn.metrics import make_scorer from sklearn.metrics import confusion_matrix, accuracy_score -from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold +from sklearn.model_selection import cross_validate, cross_val_predict, KFold, train_test_split, StratifiedKFold import tqdm import numpy @@ -122,6 +122,9 @@ def score_dataframe( df, name , ("clf", RandomForestClassifier()) #, ("clf", DummyClassifier()) ]) + # build confusion matrix + lbl_pred = cross_val_predict(pipeline, clean_df, clean_df.index) + conf_mat = confusion_matrix(clean_df.index, lbl_pred) # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline @@ -136,7 +139,7 @@ def score_dataframe( df, name df = pandas.DataFrame(cv_results) df = df.drop(["fit_time", "score_time"], axis=1) df.insert(loc=0, column='trial', value=name) - return df + return conf_mat, df def umap_plot( df , name diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index aa0b75b4..c0ba2f68 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -329,20 +329,26 @@ def main_process(params): ################ # score shape embed logger.info(f'-- score shape embed --') - shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + logger.info(f'-- shapeembed on input data') + logger.info(f'-- score:\n{shapeembed_score_df}') + logger.info(f'-- confusion matrix:\n{shapeembed_cm}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) logger.debug(f'\n{regionprops_df}') - regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') - logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') + regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') + logger.info(f'-- regionprops on input data') + logger.info(f'-- score:\n{regionprops_score_df}') + logger.info(f'-- confusion matrix:\n{regionprops_cm}') # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = 
run_elliptic_fourier_descriptors(params.dataset) logger.debug(f'\n{efd_df}') - efd_score_df = score_dataframe(efd_df, 'efd') - logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') + efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') + logger.info(f'-- elliptic fourier descriptors on input data') + logger.info(f'-- score:\n{efd_score_df}') + logger.info(f'-- confusion matrix:\n{efd_cm}') # kmeans on input data and score logger.info(f'-- kmeans on input data --') kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) From 222f69892f11af2afb513d282eaa79d3c2c4b8eb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:18:01 +0100 Subject: [PATCH 136/204] use integer division for compression factor clarg --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index c0ba2f68..a47927c4 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -471,7 +471,7 @@ def prob (x): params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor - n_features = lambda d, n: d*(d-1)/(2**n) + n_features = lambda d, n: d*(d-1)//(2**n) params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings From 793b72079eabd9f30b5f553fdc849fcd89f9889e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:19:49 +0100 Subject: [PATCH 137/204] explicitly binarise image when running regionprops --- scripts/shapeembed/evaluation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 84d8c8b3..2cfa3d0c 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -56,15 +56,18 @@ def run_regionprops( dataset_params , "minor_axis_length" , "orientation" ] ): # access the dataset - assert dataset_params.type == 'mask' + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) # ... 
and run regionprops for the given properties for each image dfs = [] logger.info(f'running regionprops on {dataset_params.name}') logger.info(f'({dataset_params.path})') for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - t = measure.regionprops_table(numpy.array(img), properties=properties) + data = numpy.where(numpy.array(img)>20, 255, 0) + t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) + assert df.shape[0] == 1, f'More than one object in image #{i}' + df.index = [i] df['class'] = lbl df.set_index("class", inplace=True) dfs.append(df) From 43673ee07805fb7240f302cb882681b282659f6f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:22:39 +0100 Subject: [PATCH 138/204] keep 'class' as a column rather than index + keeps column names as strings --- scripts/shapeembed/evaluation.py | 22 ++++++++++++++-------- scripts/shapeembed/shapeembed.py | 3 ++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 2cfa3d0c..1bb582d4 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -69,7 +69,7 @@ def run_regionprops( dataset_params assert df.shape[0] == 1, f'More than one object in image #{i}' df.index = [i] df['class'] = lbl - df.set_index("class", inplace=True) + #df.set_index("class", inplace=True) dfs.append(df) # concatenate results as a single dataframe and return it df = pandas.concat(dfs) @@ -97,7 +97,9 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): df.set_index("class", inplace=True, append=True) dfs.append(df) # concatenate results as a single dataframe and return it - return pandas.concat(dfs).xs('coeffs', level='coeffs') + df = pandas.concat(dfs).xs('coeffs', level='coeffs') + df.reset_index(level='class', inplace=True) + return df def score_dataframe( df, name , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): @@ -107,7 +109,8 @@ def score_dataframe( df, name # TODO, currently unused # Split the data into training and test sets #X_train, X_test, y_train, y_test = train_test_split( - # clean_df, clean_df.index, stratify=clean_df.index + # clean_df.drop('class', axis=1), clean_df['class'] + #, stratify=clean_df['class'] #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle #) # Define a dictionary of metrics @@ -126,13 +129,16 @@ def score_dataframe( df, name #, ("clf", DummyClassifier()) ]) # build confusion matrix - lbl_pred = cross_val_predict(pipeline, clean_df, clean_df.index) - conf_mat = confusion_matrix(clean_df.index, lbl_pred) + clean_df.columns = clean_df.columns.astype(str) # only string column names + lbl_pred = cross_val_predict( pipeline + , clean_df.drop('class', axis=1) + , clean_df['class']) + conf_mat = confusion_matrix(clean_df['class'], lbl_pred) # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline - , X=clean_df - , y=clean_df.index + , X=clean_df.drop('class', axis=1) + , y=clean_df['class'] , cv=StratifiedKFold(n_splits=k_folds) , scoring=scoring , n_jobs=-1 @@ -161,7 +167,7 @@ def umap_plot( df , random_state=rand_seed ) mask = numpy.random.rand(len(clean_df)) < split - clean_df.reset_index(level='class', inplace=True) + #clean_df.reset_index(level='class', inplace=True) classes = clean_df['class'].copy() semi_labels = classes.copy() semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index a47927c4..29189140 100755 --- 
a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -288,7 +288,8 @@ def run_predictions(trainer, model, dataloader, num_workers=8): df.insert( loc=0, column='class_name' , value=[ds.classes[x] for x in class_indices]) df.insert(loc=0, column='class', value=class_indices) - df.set_index("class", inplace=True) + #df.set_index("class", inplace=True) + df.columns = df.columns.astype(str) # only string column names return (predictions, latent_space, df) From 11a6e69e1a51df57d7768f25aea49bd96299c85a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:23:34 +0100 Subject: [PATCH 139/204] change len for shape[0] --- scripts/shapeembed/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 1bb582d4..56c24b05 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -165,7 +165,7 @@ def umap_plot( df , min_dist=min_dist , n_components=n_components , random_state=rand_seed ) - mask = numpy.random.rand(len(clean_df)) < split + mask = numpy.random.rand(clean_df.shape[0]) < split #clean_df.reset_index(level='class', inplace=True) classes = clean_df['class'].copy() From 91dd1c5dba9bd3f261f32b0bca18dc3ceee944fb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:24:06 +0100 Subject: [PATCH 140/204] drop not needed return value from run_predictions --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 29189140..c0be264e 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -291,7 +291,7 @@ def run_predictions(trainer, model, dataloader, num_workers=8): #df.set_index("class", inplace=True) df.columns = df.columns.astype(str) # only string column names - return (predictions, latent_space, df) + return latent_space, df # main process ############################################################################### @@ -314,7 +314,7 @@ def main_process(params): ################# # ... 
and gather latent space logger.info(f'-- run predictions and extract latent space --') - predictions, latent_space, shapeembed_df = run_predictions( + latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) From 963d59048d6eb6a542f696bd78de583b593c8fe0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:24:52 +0100 Subject: [PATCH 141/204] added combined shapeembed + efd + regionprops scoring and comment out kmeans --- scripts/shapeembed/shapeembed.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index c0be264e..b4a9dfbb 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -350,13 +350,24 @@ def main_process(params): logger.info(f'-- elliptic fourier descriptors on input data') logger.info(f'-- score:\n{efd_score_df}') logger.info(f'-- confusion matrix:\n{efd_cm}') - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - print(kmeans) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - # collate and save gathered results TODO KMeans + # combined shapeembed + efd + regionprops + logger.info(f'-- shapeembed + efd + regionprops --') + comb_df = pandas.concat([ shapeembed_df + , efd_df.drop('class', axis=1) + , regionprops_df.drop('class', axis=1) ], axis=1) + logger.debug(f'\n{comb_df}') + comb_cm, comb_score_df = score_dataframe(comb_df, 'combined') + logger.info(f'-- shapeembed + efd + regionprops on input data') + logger.info(f'-- score:\n{comb_score_df}') + logger.info(f'-- confusion matrix:\n{comb_cm}') + # XXX Not currently doing the kmeans + # XXX kmeans on input data and score + #logger.info(f'-- kmeans on input data --') + #kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + #print(kmeans) + #logger.info(f'-- kmeans accuracy: {accuracy}') + #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + ## collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , efd_score_df , shapeembed_score_df ]) From df09ad5f957d3ef55526f9be36aa1b346ef0aca9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 27 Jun 2024 17:35:06 +0100 Subject: [PATCH 142/204] save combined score --- scripts/shapeembed/shapeembed.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index b4a9dfbb..dabbf190 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -356,7 +356,7 @@ def main_process(params): , efd_df.drop('class', axis=1) , regionprops_df.drop('class', axis=1) ], axis=1) logger.debug(f'\n{comb_df}') - comb_cm, comb_score_df = score_dataframe(comb_df, 'combined') + comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') logger.info(f'-- shapeembed + efd + regionprops on input data') logger.info(f'-- score:\n{comb_score_df}') logger.info(f'-- confusion matrix:\n{comb_cm}') @@ -367,10 +367,12 @@ def main_process(params): #print(kmeans) #logger.info(f'-- kmeans accuracy: {accuracy}') #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - ## collate and save gathered results TODO KMeans + + # collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , 
efd_score_df - , shapeembed_score_df ]) + , shapeembed_score_df + , comb_score_df ]) save_scores(scores_df, outputdir=params.output_dir) # main entry point From ff6fbd483b90c113461546164585aedcd729b1c1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 27 Jun 2024 17:47:22 +0100 Subject: [PATCH 143/204] save confusion matrices --- scripts/shapeembed/evaluation.py | 12 ++++++++++++ scripts/shapeembed/shapeembed.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 56c24b05..b5d55cec 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -150,6 +150,18 @@ def score_dataframe( df, name df.insert(loc=0, column='trial', value=name) return conf_mat, df +def confusion_matrix_plot( cm, name, outputdir + , figsize=(10,7) ): + # Plot confusion matrix + plt.clf() # Clear figure + plt.figure(figsize=figsize) + seaborn.heatmap(cm, annot=True, fmt='d') + plt.title(f'{name} - Confusion Matrix') + plt.xlabel('Predicted') + plt.ylabel('Actual') + plt.savefig(f'{outputdir}/{name}_confusion_matrix.png') + plt.clf() # Clear figure + def umap_plot( df , name , outputdir='.' diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index dabbf190..1e05b222 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -334,6 +334,7 @@ def main_process(params): logger.info(f'-- shapeembed on input data') logger.info(f'-- score:\n{shapeembed_score_df}') logger.info(f'-- confusion matrix:\n{shapeembed_cm}') + confusion_matrix_plot(shapeembed_cm, 'shapeembed', params.output_dir) # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) @@ -342,6 +343,7 @@ def main_process(params): logger.info(f'-- regionprops on input data') logger.info(f'-- score:\n{regionprops_score_df}') logger.info(f'-- confusion matrix:\n{regionprops_cm}') + confusion_matrix_plot(regionprops_cm, 'regionprops_cm', params.output_dir) # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) @@ -350,6 +352,7 @@ def main_process(params): logger.info(f'-- elliptic fourier descriptors on input data') logger.info(f'-- score:\n{efd_score_df}') logger.info(f'-- confusion matrix:\n{efd_cm}') + confusion_matrix_plot(efd_cm, 'efd', params.output_dir) # combined shapeembed + efd + regionprops logger.info(f'-- shapeembed + efd + regionprops --') comb_df = pandas.concat([ shapeembed_df @@ -360,6 +363,7 @@ def main_process(params): logger.info(f'-- shapeembed + efd + regionprops on input data') logger.info(f'-- score:\n{comb_score_df}') logger.info(f'-- confusion matrix:\n{comb_cm}') + confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) # XXX Not currently doing the kmeans # XXX kmeans on input data and score #logger.info(f'-- kmeans on input data --') From ebadaff76a84cd66d6775b55921c9a448148b494 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 5 Jul 2024 02:16:17 +0100 Subject: [PATCH 144/204] First attempt at a result gathering script --- scripts/shapeembed/gather_run_results.py | 138 +++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100755 scripts/shapeembed/gather_run_results.py diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py new file mode 100755 index 00000000..10d66fa6 --- /dev/null +++ 
b/scripts/shapeembed/gather_run_results.py @@ -0,0 +1,138 @@ +#! /usr/bin/env python3 + +import pandas as pd +import logging +import argparse +import shutil +import os +import functools + +# define a Custom aggregation +# function for finding total +def keep_first_fname(series): + return functools.reduce(lambda x, y: y if x == 'nofile' else y, series) + +def get_run_info(run): + x = run.split('_') + return f'{x[0]}_{x[1]}', x[2], x[4] + +def main_process(clargs, logger=logging.getLogger(__name__)): + print(clargs) + os.makedirs(clargs.output_dir, exist_ok=True) + dfs = [] + for d in clargs.run_folder: + csv = f'{d}/scores_df.csv' + #csv = f'{d}/scores_df_mean.csv' + if not os.path.isfile(csv): + print(f'WARNING: no {csv} found, skipping') + continue + + run_name = os.path.basename(d) + model, latent_space_sz, dataset = get_run_info(run_name) + df = pd.read_csv(csv) + df['model'] = model + df['latent_space_sz'] = latent_space_sz + df['dataset'] = dataset + for trial in ['efd','regionprops','shapeembed', 'combined_all']: + conf_mat = f'{trial}_confusion_matrix.png' + if os.path.isfile(f'{d}/{conf_mat}'): + shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') + df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' + else: + df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' + umap = f'umap_{trial}.pdf' + if os.path.isfile(f'{d}/{umap}'): + shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') + df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' + else: + df.loc[df['trial'] == trial, 'umap'] = f'nofile' + dfs.append(df.convert_dtypes()) + df = pd.concat(dfs) + df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df + df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) + df.sort_index(inplace=True) + df = df.groupby(level=['dataset', 'trial', 'model', 'latent_space_sz']).agg({ + 'test_accuracy': 'mean' + , 'test_precision': 'mean' + , 'test_recall': 'mean' + , 'test_f1': 'mean' + , 'conf_mat': keep_first_fname + , 'umap': keep_first_fname + }) + + print('-'*80) + print(df) + print('-'*80) + + + cell_hover = { # for row hover use instead of + 'selector': 'td:hover', + 'props': [('background-color', '#ffffb3')] + } + index_names = { + 'selector': '.index_name', + 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' + } + headers = { + 'selector': 'th:not(.index_name)', + 'props': 'background-color: #eeeeee; color: #333333;' + } + + def html_img(path): + if os.path.splitext(path)[1][1:] == 'png': + return f'' + if os.path.splitext(path)[1][1:] == 'pdf': + return f'' + return '
:(
' + df['conf_mat'] = df['conf_mat'].apply(html_img) + df['umap'] = df['umap'].apply(html_img) + + def render_html(fname, d): + with open(fname, 'w') as f: + f.write(''' + ''') + s = d.style + s.set_table_styles([cell_hover, index_names, headers]) + s.to_html(f, classes='df') + + with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: + f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + df.to_latex(f) + f.write('\\end{decument}') + render_html(f'{clargs.output_dir}/gathered_table.html', df) + + dft = df.transpose() + with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: + f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + dft.to_latex(f) + f.write('\\end{decument}') + render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + parser.add_argument( 'run_folder', nargs="+", type=str + , help=f"The runs folders to gather results from") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR' + , default=f'{os.getcwd()}/gathered_results' + , help=f"The OUTPUT_DIR path to use to gather results") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + main_process(clargs, logger) From 59fab42f7eeaed9144a5ec5d2011e4a40555c1b9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 5 Jul 2024 02:37:28 +0100 Subject: [PATCH 145/204] added barplots --- scripts/shapeembed/gather_run_results.py | 28 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 10d66fa6..1af719be 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -33,20 +33,32 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df['model'] = model df['latent_space_sz'] = latent_space_sz df['dataset'] = dataset + for trial in ['efd','regionprops','shapeembed', 'combined_all']: + conf_mat = f'{trial}_confusion_matrix.png' if os.path.isfile(f'{d}/{conf_mat}'): shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' else: df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' + umap = f'umap_{trial}.pdf' if os.path.isfile(f'{d}/{umap}'): shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' else: df.loc[df['trial'] == trial, 'umap'] = f'nofile' + + barplot = f'scores_barplot.pdf' + if os.path.isfile(f'{d}/{barplot}'): + shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') + df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' + else: + df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + dfs.append(df.convert_dtypes()) + df = pd.concat(dfs) df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) @@ -58,6 +70,7 @@ def 
main_process(clargs, logger=logging.getLogger(__name__)): , 'test_f1': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname + , 'barplot': keep_first_fname }) print('-'*80) @@ -80,24 +93,29 @@ def main_process(clargs, logger=logging.getLogger(__name__)): def html_img(path): if os.path.splitext(path)[1][1:] == 'png': - return f'' + return f'' if os.path.splitext(path)[1][1:] == 'pdf': - return f'' + return f'' return '
:(
' df['conf_mat'] = df['conf_mat'].apply(html_img) df['umap'] = df['umap'].apply(html_img) + df['barplot'] = df['barplot'].apply(html_img) def render_html(fname, d): with open(fname, 'w') as f: - f.write(''' + + ''') s = d.style s.set_table_styles([cell_hover, index_names, headers]) s.to_html(f, classes='df') + f.write('') with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') From bfab20d34107346376f4e52e3aca526c45c46b8d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:13:31 +0100 Subject: [PATCH 146/204] Added a separate regionprops script --- scripts/shapeembed/regionprops.py | 85 +++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100755 scripts/shapeembed/regionprops.py diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py new file mode 100755 index 00000000..b57fa517 --- /dev/null +++ b/scripts/shapeembed/regionprops.py @@ -0,0 +1,85 @@ +#! /usr/bin/env python3 + +import types +import logging +import argparse +from skimage import measure + +# own imports +from evaluation import * + +def run_regionprops( dataset_params + , properties + , logger ): + # access the dataset + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' + ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + # ... and run regionprops for the given properties for each image + dfs = [] + logger.info(f'running regionprops on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + data = numpy.where(numpy.array(img)>20, 255, 0) + t = measure.regionprops_table(data, properties=properties) + df = pandas.DataFrame(t) + assert df.shape[0] == 1, f'More than one object in image #{i}' + df.index = [i] + df['class'] = lbl + #df.set_index("class", inplace=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs) + return df + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run regionprops on a given dataset') + + dflt_dataset=('tiny_synthetic_shapes', '/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes', 'mask') + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE'), default=dflt_dataset + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_dataset})") + + dflt_properties=[ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + # update default params with clargs + dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + properties = dflt_properties + + # regionprops on input data and score + + regionprops_df = run_regionprops(dataset, properties, logger) + + logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') + 
regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") + umap_plot(regionprops_df, f'{dataset.name}_regionprops_umap', outputdir=clargs.output_dir) + + regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') + + logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') + regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") + logger.info(f'-- confusion matrix:\n{regionprops_cm}') + confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops_cm', clargs.output_dir) From c4a3a23ae6b534fa97ff1670429b50153468e941 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:13:52 +0100 Subject: [PATCH 147/204] added a separate efd script --- scripts/shapeembed/efd.py | 83 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/shapeembed/efd.py diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py new file mode 100755 index 00000000..e2dee043 --- /dev/null +++ b/scripts/shapeembed/efd.py @@ -0,0 +1,83 @@ +#! /usr/bin/env python3 + +import types +import logging +import argparse +import pyefd + +# own imports +from evaluation import * + +def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + # ... and run efd on each image + dfs = [] + logger.info(f'running efd on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pandas.DataFrame({ + "norm_coeffs": norm_coeffs.flatten().tolist() + , "coeffs": coeffs.flatten().tolist() + }).T.rename_axis("coeffs") + df['class'] = lbl + df.set_index("class", inplace=True, append=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs).xs('coeffs', level='coeffs') + df.reset_index(level='class', inplace=True) + return df + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run efd on a given dataset') + + dflt_dataset=('tiny_synthetic_shapes', '/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes', 'mask') + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE'), default=dflt_dataset + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_dataset})") + + dflt_contour_size=512 + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + # update default params with clargs + dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + contour_size = dflt_contour_size + + # efd on input data and score + + efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) + + logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') 
+ efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") + umap_plot(efd_df, f'{dataset.name}_efd_umap', outputdir=clargs.output_dir) + + efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') + + logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') + efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") + logger.info(f'-- confusion matrix:\n{efd_cm}') + confusion_matrix_plot(efd_cm, f'{dataset.name}_efd_cm', clargs.output_dir) From b7998039ea4dfb75dd10d33164fb3c88491e8dbc Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:17:52 +0100 Subject: [PATCH 148/204] refactor efd and regionprops out of evaluation helpers --- scripts/shapeembed/efd.py | 4 +-- scripts/shapeembed/evaluation.py | 55 +------------------------------ scripts/shapeembed/regionprops.py | 4 +-- 3 files changed, 5 insertions(+), 58 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index e2dee043..d24db81d 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -73,11 +73,11 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") - umap_plot(efd_df, f'{dataset.name}_efd_umap', outputdir=clargs.output_dir) + umap_plot(efd_df, f'{dataset.name}_efd', outputdir=clargs.output_dir) efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, f'{dataset.name}_efd_cm', clargs.output_dir) + confusion_matrix_plot(efd_cm, f'{dataset.name}_efd', clargs.output_dir) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index b5d55cec..a273fb9d 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -48,59 +48,6 @@ def run_kmeans(dataframe, random_seed=42): conf_mat = confusion_matrix(dataframe['class'], kmeans.labels_) return kmeans, accuracy, conf_mat -def run_regionprops( dataset_params - , properties = [ "area" - , "perimeter" - , "centroid" - , "major_axis_length" - , "minor_axis_length" - , "orientation" ] ): - # access the dataset - assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) - # ... and run regionprops for the given properties for each image - dfs = [] - logger.info(f'running regionprops on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - data = numpy.where(numpy.array(img)>20, 255, 0) - t = measure.regionprops_table(data, properties=properties) - df = pandas.DataFrame(t) - assert df.shape[0] == 1, f'More than one object in image #{i}' - df.index = [i] - df['class'] = lbl - #df.set_index("class", inplace=True) - dfs.append(df) - # concatenate results as a single dataframe and return it - df = pandas.concat(dfs) - return df - -def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): - # access the dataset - assert dataset_params.type == 'mask' - ds = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) - # ... 
and run efd on each image - dfs = [] - logger.info(f'running efd on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) - norm_coeffs = pyefd.normalize_efd(coeffs) - df = pandas.DataFrame({ - "norm_coeffs": norm_coeffs.flatten().tolist() - , "coeffs": coeffs.flatten().tolist() - }).T.rename_axis("coeffs") - df['class'] = lbl - df.set_index("class", inplace=True, append=True) - dfs.append(df) - # concatenate results as a single dataframe and return it - df = pandas.concat(dfs).xs('coeffs', level='coeffs') - df.reset_index(level='class', inplace=True) - return df - def score_dataframe( df, name , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns @@ -205,7 +152,7 @@ def umap_plot( df seaborn.despine(left=True, bottom=True) plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) plt.tight_layout() - plt.savefig(f"{outputdir}/umap_{name}.pdf") + plt.savefig(f"{outputdir}/{name}_umap.pdf") plt.close() def save_scores( scores_df diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index b57fa517..3af36220 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -75,11 +75,11 @@ def run_regionprops( dataset_params logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") - umap_plot(regionprops_df, f'{dataset.name}_regionprops_umap', outputdir=clargs.output_dir) + umap_plot(regionprops_df, f'{dataset.name}_regionprops', outputdir=clargs.output_dir) regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops_cm', clargs.output_dir) + confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops', clargs.output_dir) From 6bc1947f63de4ae74ba92c19f297a7b2596ffc02 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:23:47 +0100 Subject: [PATCH 149/204] less debug info by default + create outdir if not there --- scripts/shapeembed/efd.py | 6 +++++- scripts/shapeembed/evaluation.py | 2 +- scripts/shapeembed/regionprops.py | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index d24db81d..20f250c5 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -1,9 +1,10 @@ #! 
/usr/bin/env python3 +import os import types +import pyefd import logging import argparse -import pyefd # own imports from evaluation import * @@ -67,6 +68,9 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): , type=clargs.dataset[2] ) contour_size = dflt_contour_size + # create output dir if it does not exist + os.makedirs(clargs.output_dir, exist_ok=True) + # efd on input data and score efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index a273fb9d..0c2fec76 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -23,7 +23,7 @@ # logging facilities ############################################################################### logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.DEBUG) +#logging.basicConfig(level=logging.DEBUG) def dataloader_to_dataframe(dataloader): # gather the data and the associated labels, and drop rows with NaNs diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 3af36220..1d76309c 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -1,5 +1,6 @@ #! /usr/bin/env python3 +import os import types import logging import argparse @@ -69,6 +70,9 @@ def run_regionprops( dataset_params , type=clargs.dataset[2] ) properties = dflt_properties + # create output dir if it does not exist + os.makedirs(clargs.output_dir, exist_ok=True) + # regionprops on input data and score regionprops_df = run_regionprops(dataset, properties, logger) From db47da97e65b82f14a824d9ca2ef1f09972a7199 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:52:42 +0100 Subject: [PATCH 150/204] removed regionprops/efd from main shapeembed script + filename sanitisation --- scripts/shapeembed/shapeembed.py | 91 ++++++++++++-------------------- 1 file changed, 35 insertions(+), 56 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 1e05b222..53e963a8 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -64,7 +64,7 @@ model_name='resnet18_vae' , dataset=types.SimpleNamespace( name='tiny_synthetic_shapes' - , path='/nfs/research/uhlmann/afoix/image_datasets/tiny_synthetic_shapes' + , path='/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes' , type='mask' ) , batch_size=4 @@ -313,71 +313,50 @@ def main_process(params): # run predictions ################# # ... and gather latent space + os.makedirs(f"{params.output_dir}/", exist_ok=True) logger.info(f'-- run predictions and extract latent space --') latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - # ... 
and prepare output directory and save latent space - os.makedirs(f"{params.output_dir}/", exist_ok=True) - np.save(f'{params.output_dir}/latent_space.npy', latent_space) - shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') + np.save(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, 'shapeembed', outputdir=params.output_dir) - - # gather metrics - ################ - # score shape embed + umap_plot(shapeembed_df, f'{params.dataset.name}_shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data') - logger.info(f'-- score:\n{shapeembed_score_df}') + logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') + shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, 'shapeembed', params.output_dir) - # regionprops on input data and score - logger.info(f'-- regionprops on input data --') - regionprops_df = run_regionprops(params.dataset) - logger.debug(f'\n{regionprops_df}') - regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') - logger.info(f'-- regionprops on input data') - logger.info(f'-- score:\n{regionprops_score_df}') - logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, 'regionprops_cm', params.output_dir) - # elliptic fourier descriptors on input data and score - logger.info(f'-- elliptic fourier descriptors on input data --') - efd_df = run_elliptic_fourier_descriptors(params.dataset) - logger.debug(f'\n{efd_df}') - efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') - logger.info(f'-- elliptic fourier descriptors on input data') - logger.info(f'-- score:\n{efd_score_df}') - logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, 'efd', params.output_dir) - # combined shapeembed + efd + regionprops - logger.info(f'-- shapeembed + efd + regionprops --') - comb_df = pandas.concat([ shapeembed_df - , efd_df.drop('class', axis=1) - , regionprops_df.drop('class', axis=1) ], axis=1) - logger.debug(f'\n{comb_df}') - comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') - logger.info(f'-- shapeembed + efd + regionprops on input data') - logger.info(f'-- score:\n{comb_score_df}') - logger.info(f'-- confusion matrix:\n{comb_cm}') - confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) - # XXX Not currently doing the kmeans - # XXX kmeans on input data and score - #logger.info(f'-- kmeans on input data --') - #kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - #print(kmeans) - #logger.info(f'-- kmeans accuracy: {accuracy}') - #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - - # collate and save gathered results TODO KMeans - scores_df = pandas.concat([ regionprops_score_df - , efd_score_df - , shapeembed_score_df - , comb_score_df ]) - save_scores(scores_df, outputdir=params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}_shapeembed', params.output_dir) + # XXX TODO move 
somewhere else if desired XXX + ## combined shapeembed + efd + regionprops + #logger.info(f'-- shapeembed + efd + regionprops --') + #comb_df = pandas.concat([ shapeembed_df + # , efd_df.drop('class', axis=1) + # , regionprops_df.drop('class', axis=1) ], axis=1) + #logger.debug(f'\n{comb_df}') + #comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') + #logger.info(f'-- shapeembed + efd + regionprops on input data') + #logger.info(f'-- score:\n{comb_score_df}') + #logger.info(f'-- confusion matrix:\n{comb_cm}') + #confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) + ## XXX Not currently doing the kmeans + ## XXX kmeans on input data and score + ##logger.info(f'-- kmeans on input data --') + ##kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + ##print(kmeans) + ##logger.info(f'-- kmeans accuracy: {accuracy}') + ##logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + + ## collate and save gathered results TODO KMeans + #scores_df = pandas.concat([ regionprops_score_df + # , efd_score_df + # , shapeembed_score_df + # , comb_score_df ]) + #save_scores(scores_df, outputdir=params.output_dir) # main entry point ############################################################################### From 5a8b27498194495e282f8cd6fa0150da0a582b9b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:01:54 +0100 Subject: [PATCH 151/204] unify file names across efd/regionprops/shapeembed --- scripts/shapeembed/efd.py | 8 ++++---- scripts/shapeembed/evaluation.py | 4 ++-- scripts/shapeembed/regionprops.py | 8 ++++---- scripts/shapeembed/shapeembed.py | 12 ++++++------ 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 20f250c5..4f910990 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -76,12 +76,12 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') - efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") - umap_plot(efd_df, f'{dataset.name}_efd', outputdir=clargs.output_dir) + efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-raw_df.csv") + umap_plot(efd_df, f'{dataset.name}-efd', outputdir=clargs.output_dir) efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') - efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") + efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-score_df.csv") logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, f'{dataset.name}_efd', clargs.output_dir) + confusion_matrix_plot(efd_cm, f'{dataset.name}-efd', clargs.output_dir) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 0c2fec76..b5b2fbb8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -106,7 +106,7 @@ def confusion_matrix_plot( cm, name, outputdir plt.title(f'{name} - Confusion Matrix') plt.xlabel('Predicted') plt.ylabel('Actual') - plt.savefig(f'{outputdir}/{name}_confusion_matrix.png') + plt.savefig(f'{outputdir}/{name}-confusion_matrix.png') plt.clf() # Clear figure def umap_plot( df @@ -152,7 +152,7 @@ def umap_plot( df seaborn.despine(left=True, bottom=True) plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) plt.tight_layout() - 
plt.savefig(f"{outputdir}/{name}_umap.pdf") + plt.savefig(f"{outputdir}/{name}-umap.pdf") plt.close() def save_scores( scores_df diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 1d76309c..3b65933f 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -78,12 +78,12 @@ def run_regionprops( dataset_params regionprops_df = run_regionprops(dataset, properties, logger) logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') - regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") - umap_plot(regionprops_df, f'{dataset.name}_regionprops', outputdir=clargs.output_dir) + regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-raw_df.csv") + umap_plot(regionprops_df, f'{dataset.name}-regionprops', outputdir=clargs.output_dir) regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') - regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") + regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-score_df.csv") logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops', clargs.output_dir) + confusion_matrix_plot(regionprops_cm, f'{dataset.name}-regionprops', clargs.output_dir) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 53e963a8..7ae9130b 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -320,17 +320,17 @@ def main_process(params): , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - np.save(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.npy', latent_space) - shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.pkl') - shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_df.csv") + np.save(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-raw_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, f'{params.dataset.name}_shapeembed', outputdir=params.output_dir) + umap_plot(shapeembed_df, f'{params.dataset.name}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') - shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_score_df.csv") + shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}_shapeembed', params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}-shapeembed', params.output_dir) # XXX TODO move somewhere else if desired XXX ## combined shapeembed + efd + regionprops #logger.info(f'-- shapeembed + efd + regionprops --') From 9d3a0538a16c2568fbf177b2aaf4661cc88a08e1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:04:33 +0100 Subject: [PATCH 152/204] Added a readme --- scripts/shapeembed/readme.md 
| 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 scripts/shapeembed/readme.md diff --git a/scripts/shapeembed/readme.md b/scripts/shapeembed/readme.md new file mode 100644 index 00000000..76bebf92 --- /dev/null +++ b/scripts/shapeembed/readme.md @@ -0,0 +1,9 @@ +# Shape Embed + +There are currently 3 toplevel scripts: + +- shapeembed.py +- regionprops.py +- efd.py + +Each can be run to generate results, a umap and a confusion matrix. Each have a `-o` option to specify an output directory. From 8326afcd2b4b4a66fe730985443afc1ccfa56c0b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:55:20 +0100 Subject: [PATCH 153/204] track params in reporting --- scripts/shapeembed/evaluation.py | 4 ++++ scripts/shapeembed/shapeembed.py | 38 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index b5b2fbb8..3f3452d8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -49,6 +49,7 @@ def run_kmeans(dataframe, random_seed=42): return kmeans, accuracy, conf_mat def score_dataframe( df, name + , tag_columns=[] , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns #clean_df = df.select_dtypes(exclude=['object']) @@ -95,6 +96,9 @@ def score_dataframe( df, name df = pandas.DataFrame(cv_results) df = df.drop(["fit_time", "score_time"], axis=1) df.insert(loc=0, column='trial', value=name) + tag_columns.reverse() + for tag_col_name, tag_col_value in tag_columns: + df.insert(loc=0, column=tag_col_name, value=tag_col_value) return conf_mat, df def confusion_matrix_plot( cm, name, outputdir diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 7ae9130b..5e365dbd 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -97,6 +97,24 @@ , cycle_momentum=False ) +def model_str(params): + s = f'{params.model_name}' + if vars(params.model_args): + s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" + return s + +def job_str(params): + return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" + +def tag_cols(params): + cols = [] + cols.append(('dataset', params.dataset.name)) + cols.append(('model', model_str(params))) + cols.append(('compression_factor', params.compression_factor)) + cols.append(('latent_dim', params.latent_dim)) + cols.append(('batch_size', params.batch_size)) + return cols + # dataset loading functions ############################################################################### @@ -191,8 +209,7 @@ def get_trainer(model, params): # setup WandB logger logger.info('setup wandb logger') - jobname = f"{params.model_name}_{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}" - wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=job_str(params)) wandblogger.watch(model, log="all") # setup checkpoints @@ -320,17 +337,18 @@ def main_process(params): , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - np.save(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.npy', latent_space) - 
shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.pkl') - shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-raw_df.csv") + pfx=job_str(params) + np.save(f'{params.output_dir}/{pfx}-shapeembed-latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{pfx}-shapeembed-latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-raw_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, f'{params.dataset.name}-shapeembed', outputdir=params.output_dir) + umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') - shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)) logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') - shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-score_df.csv") + shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}-shapeembed', params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{pfx}-shapeembed', params.output_dir) # XXX TODO move somewhere else if desired XXX ## combined shapeembed + efd + regionprops #logger.info(f'-- shapeembed + efd + regionprops --') @@ -481,7 +499,7 @@ def prob (x): if clargs.output_dir: params.output_dir = clargs.output_dir else: - params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + params.output_dir = f'./{job_str(params)}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' # XXX torch.set_float32_matmul_precision('medium') From aaa55dbc6ee5df7fefa4fbc41b47bcc416019088 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 21:10:22 +0100 Subject: [PATCH 154/204] also add model specific params as tag columns --- scripts/shapeembed/shapeembed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 5e365dbd..182a2415 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -110,6 +110,7 @@ def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) cols.append(('model', model_str(params))) + for k, v in vars(params.model_args).items(): cols.append((k, v)) cols.append(('compression_factor', params.compression_factor)) cols.append(('latent_dim', params.latent_dim)) cols.append(('batch_size', params.batch_size)) From ba67d36237d7614fcb1e968dc18c927d48be1234 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 22:19:33 +0100 Subject: [PATCH 155/204] added a slurm script to sweap shapeembed parameters --- scripts/shapeembed/slurm_sweap_shapeembed.py | 132 +++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100755 scripts/shapeembed/slurm_sweap_shapeembed.py diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py new file mode 100755 index 00000000..f8af3c85 --- /dev/null +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -0,0 +1,132 @@ +#! 
/usr/bin/env python3 + +import os +import logging +import argparse +import datetime +import subprocess + +# shapeembed parameters to sweap +################################################################################ + +datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' +datasets = [ +# ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") + ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +#, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") +, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +#, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") +#, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") +] + +models = [ + "resnet18_vae" +#, "resnet50_vae" +#, "resnet18_beta_vae" +#, "resnet18_vae_bolt" +#, "resnet50_vae_bolt" +, "resnet18_vqvae" +#, "resnet50_vqvae" +#, "resnet18_vqvae_legacy" +#, "resnet50_vqvae_legacy" +#, "resnet101_vqvae_legacy" +#, "resnet110_vqvae_legacy" +#, "resnet152_vqvae_legacy" +#, "resnet18_vae_legacy" +#, "resnet50_vae_legacy" +] + +model_params = { + "resnet18_beta_vae": {'beta': [0.5, 1.0, 2]} +} + +compression_factors = [2, 4] + +batch_sizes = [4] + +# other parameters +################################################################################ + +dflt_slurm_dir=f'{os.getcwd()}/slurm_info_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' +dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + +slurm_time = '50:00:00' +slurm_mem = '200G' +slurm_gpus = 'a100:1' + +n_epochs = 2 + +wandb_project='shapeembed' + +slurm_script="""#! /bin/bash +echo "running shape embed with:" +echo " - dataset {dataset[0]} ({dataset[1]}, {dataset[2]})" +echo " - model {model} ({model_params})" +echo " - compression_factor {compression_factor}" +echo " - batch size {batch_size}" +python3 shapeembed.py --wandb-project {wandb_project} --num-epochs {n_epochs} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} +""" + +################################################################################ + +def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size): + jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' + logger.info(f'spawning {jobname}') + with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: + fp.write(slurm_script.format( dataset=dataset + , model=model + , model_params=[] + , compression_factor=compression_factor + , batch_size=batch_size + , out_dir=out_dir + , wandb_project=wandb_project + , n_epochs=n_epochs )) + fp.flush() + logger.info(f'written {fp.name}') + logger.debug(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + logger.debug(result.stdout.decode('utf-8')) + result = subprocess.run([ 'sbatch' + , '--time', slurm_time + , '--mem', slurm_mem + , '--job-name', jobname + , '--output', f'{slurm_out_dir}/{jobname}.out' + , '--error', f'{slurm_out_dir}/{jobname}.err' + #, '--gres', n_gpus(ls) + , f'--gpus={slurm_gpus}' + , fp.name ], stdout=subprocess.PIPE) + logger.info(result.stdout.decode('utf-8')) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Sweap parameters for shapeembed') + + parser.add_argument( + '-s', '--slurm-output-dir', metavar='SLURM_OUTPUT_DIR', default=dflt_slurm_dir + , help=f"The SLURM_OUTPUT_DIR path 
to use to dump slurm info") + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=dflt_out_dir + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + os.makedirs(clargs.slurm_output_dir, exist_ok=True) + os.makedirs(clargs.output_dir, exist_ok=True) + + for params in [ (ds, m, cf, bs) for ds in datasets + for m in models + for cf in compression_factors + for bs in batch_sizes ]: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) From 7c422b1473a861690491d5da20e882051fd279d6 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:23:05 +0100 Subject: [PATCH 156/204] added resnet50_beta_vae to the factory --- bioimage_embed/models/factory.py | 40 +++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 713b98af..8c6440d5 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -97,19 +97,6 @@ def resnet18_vae(self): bolts.ResNet18VAEDecoder, ) - def resnet50_vae(self): - return self.create_model( - partial( - pythae.models.VAEConfig, - use_default_encoder=False, - use_default_decoder=False, - **self.kwargs - ), - pythae.models.VAE, - bolts.ResNet50VAEEncoder, - bolts.ResNet50VAEDecoder, - ) - def resnet18_vqvae(self): return self.create_model( partial( @@ -136,6 +123,19 @@ def resnet18_beta_vae(self): bolts.ResNet18VAEDecoder, ) + def resnet50_vae(self): + return self.create_model( + partial( + pythae.models.VAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.VAE, + bolts.ResNet50VAEEncoder, + bolts.ResNet50VAEDecoder, + ) + def resnet50_vqvae(self): return self.create_model( partial( @@ -149,6 +149,19 @@ def resnet50_vqvae(self): bolts.ResNet50VQVAEDecoder, ) + def resnet50_beta_vae(self): + return self.create_model( + partial( + pythae.models.BetaVAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.BetaVAE, + bolts.ResNet50VAEEncoder, + bolts.ResNet50VAEDecoder, + ) + def resnet_vae_legacy(self, depth): return self.create_model( pythae.models.VAEConfig, @@ -192,6 +205,7 @@ def resnet152_vqvae_legacy(self): "resnet18_vae", "resnet18_beta_vae", "resnet50_vae", + "resnet50_beta_vae", "resnet18_vae_bolt", "resnet50_vae_bolt", "resnet18_vqvae", From 1307579b2239fc83f9ab2998f4023e19bffa1a94 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:23:56 +0100 Subject: [PATCH 157/204] added resnet50_beta_vae to the shapeembed script --- scripts/shapeembed/shapeembed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 182a2415..2b987875 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -46,6 +46,7 @@ "resnet18_vae" , "resnet50_vae" , "resnet18_beta_vae" +, "resnet50_beta_vae" , "resnet18_vae_bolt" , "resnet50_vae_bolt" , "resnet18_vqvae" From 1f82d9f200524998b93eadd2919cd67c54d419e4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:24:47 +0100 Subject: [PATCH 158/204] handle per 
model params in slurm script + chose some param values to sweap --- scripts/shapeembed/slurm_sweap_shapeembed.py | 41 +++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index f8af3c85..b1ef3389 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -4,6 +4,7 @@ import logging import argparse import datetime +import itertools import subprocess # shapeembed parameters to sweap @@ -11,22 +12,24 @@ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' datasets = [ + ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") - ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") -, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +#, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") #, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") ] models = [ "resnet18_vae" -#, "resnet50_vae" -#, "resnet18_beta_vae" +, "resnet50_vae" +, "resnet18_beta_vae" +, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" , "resnet18_vqvae" -#, "resnet50_vqvae" +, "resnet50_vqvae" #, "resnet18_vqvae_legacy" #, "resnet50_vqvae_legacy" #, "resnet101_vqvae_legacy" @@ -37,12 +40,13 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [0.5, 1.0, 2]} + "resnet18_beta_vae": {'beta': [1,2,5,10,20]} +, "resnet50_beta_vae": {'beta': [1,2,5,10,20]} } -compression_factors = [2, 4] +compression_factors = [1,2,3,5,10,20] -batch_sizes = [4] +batch_sizes = [4, 8, 16] # other parameters ################################################################################ @@ -54,8 +58,6 @@ slurm_mem = '200G' slurm_gpus = 'a100:1' -n_epochs = 2 - wandb_project='shapeembed' slurm_script="""#! 
/bin/bash @@ -64,15 +66,19 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --num-epochs {n_epochs} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} +python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ -def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size): +def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: + extra_args=[] + for k, v in kwargs.items(): + extra_args.append(f'--model-arg-{k}') + extra_args.append(f'{v}') fp.write(slurm_script.format( dataset=dataset , model=model , model_params=[] @@ -80,7 +86,7 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ , batch_size=batch_size , out_dir=out_dir , wandb_project=wandb_project - , n_epochs=n_epochs )) + , extra_args=' '.join(extra_args) )) fp.flush() logger.info(f'written {fp.name}') logger.debug(f'cat {fp.name}') @@ -129,4 +135,11 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ for m in models for cf in compression_factors for bs in batch_sizes ]: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) + # per model params: + m = params[1] + if m in model_params: + mps = model_params[m] + for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params, **ps) + else: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) From 2c49fc9c1ad010a077037cd1b2dbaac5db509d22 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:38:48 +0100 Subject: [PATCH 159/204] better slurm jobname --- scripts/shapeembed/slurm_sweap_shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index b1ef3389..456246dd 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -72,7 +72,10 @@ ################################################################################ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): - jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' + model_str = model + if kwargs: + model_str += f"_{'_'.join([f'{k}{v}' for k, v in kwargs.items()])}" + jobname = f'shapeembed-{dataset[0]}-{model_str}-{compression_factor}-{batch_size}' logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: extra_args=[] From a291c037281154ff9d85a52cf17d64843c438442 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:58:00 +0100 Subject: [PATCH 160/204] removed compression factor 20 --- 
scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 456246dd..5568b9b3 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -44,7 +44,7 @@ , "resnet50_beta_vae": {'beta': [1,2,5,10,20]} } -compression_factors = [1,2,3,5,10,20] +compression_factors = [1,2,3,5,10] batch_sizes = [4, 8, 16] From a29b6bd652315117e5ca44f4944f27667db61c08 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:58:58 +0100 Subject: [PATCH 161/204] bumped up memory allocation to 250G --- scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 5568b9b3..ffed3933 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -55,7 +55,7 @@ dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' slurm_time = '50:00:00' -slurm_mem = '200G' +slurm_mem = '250G' slurm_gpus = 'a100:1' wandb_project='shapeembed' From dde96fc499fde1c124483e9439ff5ccedd5b3e03 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:59:26 +0100 Subject: [PATCH 162/204] added --no-early-stop flag --- scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index ffed3933..50925bfe 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -66,7 +66,7 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} +python3 shapeembed.py --no-early-stop --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ From 8dbf551cef3de04c66871959843252d554cdbcce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 15:35:04 +0100 Subject: [PATCH 163/204] added an oom_retry function --- scripts/shapeembed/shapeembed.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 2b987875..6cedd481 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -117,6 +117,18 @@ def tag_cols(params): cols.append(('batch_size', params.batch_size)) return cols +def oom_retry(f, *args, n_oom_retries=1, logger=logging.getLogger(__name__), **kwargs): + try: + logger.info(f'Trying {f.__name__} within oom_retry, n_oom_retries = {n_oom_retries}') + return f(*args, **kwargs) + except RuntimeError as e: + if 'out of memory' in str(e) and n_oom_retries > 0: + logger.warning(f'{f.__name__} ran out of memory, retrying') + torch.cuda.empty_cache() + return oom_retry(f, *args, n_oom_retries=n_oom_retries-1, logger=logger, 
**kwargs) + else: + raise e + # dataset loading functions ############################################################################### @@ -319,15 +331,15 @@ def main_process(params): # setup ####### - model = get_model(params) - trainer = get_trainer(model, params) - dataloader = get_dataloader(params) + model = oom_retry(get_model, params) + trainer = oom_retry(get_trainer, model, params) + dataloader = oom_retry(get_dataloader, params) # run actual work ################# - train_model(trainer, model, dataloader) - validate_model(trainer, model, dataloader) - test_model(trainer, model, dataloader) + oom_retry(train_model, trainer, model, dataloader, n_oom_retries=2) + oom_retry(validate_model, trainer, model, dataloader) + oom_retry(test_model, trainer, model, dataloader) # run predictions ################# From 38a9ff6429c4a6f3a3a3940dc4cf7868253d4157 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 15:55:37 +0100 Subject: [PATCH 164/204] refined min / max epochs clargs --- scripts/shapeembed/shapeembed.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 6cedd481..9144e17a 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -74,7 +74,8 @@ , num_embeddings=1024 , num_hiddens=1024 , num_workers=8 -, epochs=150 +, min_epochs=50 +, max_epochs=150 , pretrained=False , frobenius_norm=False , early_stop=False @@ -251,8 +252,8 @@ def get_trainer(model, params): , accelerator="gpu" , accumulate_grad_batches=4 , callbacks=trainer_callbacks - , min_epochs=50 - , max_epochs=params.epochs + , min_epochs=params.min_epochs + , max_epochs=params.max_epochs , log_every_n_steps=1 ) @@ -453,9 +454,15 @@ def prob (x): parser.add_argument( '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int , help=f"The NUM_WORKERS for the run, a positive integer (default {dflt_params.num_workers})") + parser.add_argument( + '--min-epochs', metavar='MIN_EPOCHS', type=auto_pos_int + , help=f"Set the MIN_EPOCHS for the run, a positive integer (default {dflt_params.min_epochs})") + parser.add_argument( + '--max-epochs', metavar='MAX_EPOCHS', type=auto_pos_int + , help=f"Set the MAX_EPOCHS for the run, a positive integer (default {dflt_params.max_epochs})") parser.add_argument( '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int - , help=f"The NUM_EPOCHS for the run, a positive integer (default {dflt_params.epochs})") + , help=f"Forces the NUM_EPOCHS for the run, a positive integer (sets both min and max epoch)") parser.add_argument('--clear-checkpoints', action='store_true' , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 @@ -508,8 +515,13 @@ def prob (x): params.num_hiddens = clargs.number_hiddens if clargs.num_workers: params.num_workers = clargs.num_workers + if clargs.min_epochs: + params.min_epochs = clargs.min_epochs + if clargs.max_epochs: + params.max_epochs = clargs.max_epochs if clargs.num_epochs: - params.epochs = clargs.num_epochs + params.min_epochs = clargs.num_epochs + params.max_epochs = clargs.num_epochs if clargs.output_dir: params.output_dir = clargs.output_dir else: From 10c8b507688710b85ef63c63cf67b85f915f0626 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 16:03:00 +0100 Subject: [PATCH 165/204] slurm script refactor args + force 150 epochs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 50925bfe..ffbc8b07 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -66,7 +66,7 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --no-early-stop --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} +python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ @@ -79,6 +79,9 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: extra_args=[] + extra_args.append('--no-early-stop') + extra_args.append('--num-epochs') + extra_args.append('150') for k, v in kwargs.items(): extra_args.append(f'--model-arg-{k}') extra_args.append(f'{v}') From c0cd3b4b4b34362e3396089e6b137f5cd4c473fb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:11:06 +0100 Subject: [PATCH 166/204] bring triangular + compression computation in named function (to share use) --- scripts/shapeembed/shapeembed.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 9144e17a..cd41fbe7 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -99,6 +99,9 @@ , cycle_momentum=False ) +def compressed_n_features(dist_mat_size, comp_fact): + return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) + def model_str(params): s = f'{params.model_name}' if vars(params.model_args): @@ -507,8 +510,7 @@ def prob (x): params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor - n_features = lambda d, n: d*(d-1)//(2**n) - params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) + params.latent_dim = compressed_n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings if clargs.number_hiddens: From 3b1fdda50c23cdf9da9f5176b42a525ca0f09018 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:12:00 +0100 Subject: [PATCH 167/204] fix in model_str function test of model_args --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index cd41fbe7..ab0f11a9 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -104,7 +104,7 @@ def compressed_n_features(dist_mat_size, comp_fact): def model_str(params): s = f'{params.model_name}' - if vars(params.model_args): + if hasattr(params, 'model_args'): s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" return s From 65842f6ca0b217bd1b3cdc07ef399d4a72da1c10 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:14:50 +0100 Subject: [PATCH 168/204] 
refactor slurm script to detect already completed jobs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 162 +++++++++++++------ 1 file changed, 115 insertions(+), 47 deletions(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index ffbc8b07..a507c40f 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -1,12 +1,18 @@ #! /usr/bin/env python3 import os +import glob +import copy +import types import logging +import tempfile import argparse import datetime import itertools import subprocess +import shapeembed + # shapeembed parameters to sweap ################################################################################ @@ -48,6 +54,47 @@ batch_sizes = [4, 8, 16] +def gen_params_sweap_list(): + p_sweap_list = [] + for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) + , 'model_name': m + , 'compression_factor': cf + , 'latent_dim': shapeembed.compressed_n_features(512, cf) + , 'batch_size': bs + } for ds in datasets + for m in models + for cf in compression_factors + for bs in batch_sizes ]: + # per model params: + if params['model_name'] in model_params: + mps = model_params[params['model_name']] + for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: + newparams = copy.deepcopy(params) + newparams['model_args'] = types.SimpleNamespace(**ps) + p_sweap_list.append(types.SimpleNamespace(**newparams)) + else: + p_sweap_list.append(types.SimpleNamespace(**params)) + return p_sweap_list + +def params_match(x, ys): + found = False + def check_model_args(a, b): + a_yes = hasattr(a, 'model_args') + b_yes = hasattr(b, 'model_args') + if not a_yes and not b_yes: return True + if a_yes and b_yes: return a.model_args == b.model_args + return False + for y in ys: + if x.dataset.name == y.dataset \ + and x.model_name == y.model_name \ + and check_model_args(x, y) \ + and x.compression_factor == y.compression_factor \ + and x.latent_dim == y.latent_dim \ + and x.batch_size == y.batch_size: + found = True + break + return found + # other parameters ################################################################################ @@ -58,6 +105,7 @@ slurm_mem = '250G' slurm_gpus = 'a100:1' +shapeembed_script=f'{os.getcwd()}/shapeembed.py' wandb_project='shapeembed' slurm_script="""#! 
/bin/bash @@ -71,43 +119,70 @@ ################################################################################ -def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): - model_str = model - if kwargs: - model_str += f"_{'_'.join([f'{k}{v}' for k, v in kwargs.items()])}" - jobname = f'shapeembed-{dataset[0]}-{model_str}-{compression_factor}-{batch_size}' - logger.info(f'spawning {jobname}') - with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: - extra_args=[] - extra_args.append('--no-early-stop') - extra_args.append('--num-epochs') - extra_args.append('150') - for k, v in kwargs.items(): - extra_args.append(f'--model-arg-{k}') - extra_args.append(f'{v}') - fp.write(slurm_script.format( dataset=dataset - , model=model - , model_params=[] - , compression_factor=compression_factor - , batch_size=batch_size - , out_dir=out_dir - , wandb_project=wandb_project - , extra_args=' '.join(extra_args) )) +def model_params_from_model_params_str(modelparamsstr): + rawps = modelparamsstr.split('_') + ps = {} + for p in rawps: + if p[0:4] == 'beta': ps['beta'] = float(p[4:]) + return types.SimpleNamespace(**ps) + +def params_from_job_str(jobstr): + raw = jobstr.split('-') + ps = {} + ps['batch_size'] = int(raw.pop()) + ps['latent_dim'] = int(raw.pop()) + ps['compression_factor'] = int(raw.pop()) + if len(raw) == 3: + ps['model_args'] = model_params_from_model_params_str(raw.pop()) + ps['model_name'] = raw.pop() + ps['dataset'] = raw.pop() + return types.SimpleNamespace(**ps) + +def find_done_params(out_dir): + ps = [] + for f in glob.glob(f'{out_dir}/*-shapeembed-score_df.csv'): + ps.append(params_from_job_str(os.path.basename(f)[:-24])) + return ps + +def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): + + jobname = shapeembed.job_str(ps) + cmd = [ 'python3', shapeembed_script + , '--wandb-project', wandb_project + , '--output-dir', out_dir + ] + cmd += [ '--clear-checkpoints' + , '--no-early-stop' + , '--num-epochs', 150 + ] + cmd += [ '--dataset', ps.dataset.name, ps.dataset.path, ps.dataset.type + , '--model', ps.model_name + , '--compression-factor', ps.compression_factor + , '--batch-size', ps.batch_size + ] + if hasattr(ps, 'model_args'): + for k, v in vars(ps.model_args).items(): + cmd.append(f'--model-arg-{k}') + cmd.append(f'{v}') + logger.debug(" ".join(map(str,cmd))) + with tempfile.NamedTemporaryFile('w+') as fp: + fp.write('#! 
/usr/bin/env sh\n') + fp.write(" ".join(map(str,cmd))) + fp.write('\n') fp.flush() - logger.info(f'written {fp.name}') - logger.debug(f'cat {fp.name}') - result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + cmd = [ 'sbatch' + , '--time', slurm_time + , '--mem', slurm_mem + , '--job-name', jobname + , '--output', f'{slurm_out_dir}/{jobname}.out' + , '--error', f'{slurm_out_dir}/{jobname}.err' + , f'--gpus={slurm_gpus}' + , fp.name ] + logger.debug(" ".join(map(str,cmd))) + result = subprocess.run(cmd, stdout=subprocess.PIPE) logger.debug(result.stdout.decode('utf-8')) - result = subprocess.run([ 'sbatch' - , '--time', slurm_time - , '--mem', slurm_mem - , '--job-name', jobname - , '--output', f'{slurm_out_dir}/{jobname}.out' - , '--error', f'{slurm_out_dir}/{jobname}.err' - #, '--gres', n_gpus(ls) - , f'--gpus={slurm_gpus}' - , fp.name ], stdout=subprocess.PIPE) - logger.info(result.stdout.decode('utf-8')) + logger.info(f'job spawned for {ps}') + if __name__ == "__main__": @@ -137,15 +212,8 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - for params in [ (ds, m, cf, bs) for ds in datasets - for m in models - for cf in compression_factors - for bs in batch_sizes ]: - # per model params: - m = params[1] - if m in model_params: - mps = model_params[m] - for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params, **ps) - else: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) + done_params = find_done_params(clargs.output_dir) + all_params = gen_params_sweap_list() + todo_params = [x for x in all_params if not params_match(x, done_params)] + for ps in todo_params: + spawn_slurm_job(clargs.slurm_output_dir, clargs.output_dir, ps, logger=logger) From f97657e37201dbaf9962ed2488cd0cdac1e36567 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 00:02:15 +0100 Subject: [PATCH 169/204] factored out some common helpers --- scripts/shapeembed/common_helpers.py | 38 +++++++++++++++++++ scripts/shapeembed/shapeembed.py | 14 +------ scripts/shapeembed/slurm_sweap_shapeembed.py | 40 ++++---------------- 3 files changed, 48 insertions(+), 44 deletions(-) create mode 100644 scripts/shapeembed/common_helpers.py diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py new file mode 100644 index 00000000..71a13847 --- /dev/null +++ b/scripts/shapeembed/common_helpers.py @@ -0,0 +1,38 @@ +import os +import glob +import types +import logging + +def compressed_n_features(dist_mat_size, comp_fact): + return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) + +def model_str(params): + s = f'{params.model_name}' + if hasattr(params, 'model_args'): + s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" + return s + +def job_str(params): + return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" + +def params_from_job_str(jobstr): + raw = jobstr.split('-') + ps = types.SimpleNamespace() + ps.batch_size = int(raw.pop()) + ps.latent_dim = int(raw.pop()) + ps.compression_factor = int(raw.pop()) + if len(raw) == 3: + ps.model_args = types.SimpleNamespace() + for p in raw.pop().split('-'): + if p[0:4] == 'beta': ps.model_args.beta = float(p[4:]) + ps.model_name = raw.pop() + ps.dataset = raw.pop() + return ps + +def 
find_existing_run_scores(dirname, logger=logging.getLogger(__name__)): + ps = [] + for f in glob.glob(f'{dirname}/*-shapeembed-score_df.csv'): + p = params_from_job_str(os.path.basename(f)[:-24]) + p.csv_file = f + ps.append(p) + return ps diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index ab0f11a9..744f29fa 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -27,6 +27,8 @@ from dataset_transformations import * from evaluation import * +from common_helpers import * + # logging facilities ############################################################################### logger = logging.getLogger(__name__) @@ -99,18 +101,6 @@ , cycle_momentum=False ) -def compressed_n_features(dist_mat_size, comp_fact): - return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) - -def model_str(params): - s = f'{params.model_name}' - if hasattr(params, 'model_args'): - s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" - return s - -def job_str(params): - return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" - def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index a507c40f..6ebb81ac 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -11,7 +11,7 @@ import itertools import subprocess -import shapeembed +from common_helpers import * # shapeembed parameters to sweap ################################################################################ @@ -59,7 +59,7 @@ def gen_params_sweap_list(): for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) , 'model_name': m , 'compression_factor': cf - , 'latent_dim': shapeembed.compressed_n_features(512, cf) + , 'latent_dim': compressed_n_features(512, cf) , 'batch_size': bs } for ds in datasets for m in models @@ -119,34 +119,9 @@ def check_model_args(a, b): ################################################################################ -def model_params_from_model_params_str(modelparamsstr): - rawps = modelparamsstr.split('_') - ps = {} - for p in rawps: - if p[0:4] == 'beta': ps['beta'] = float(p[4:]) - return types.SimpleNamespace(**ps) - -def params_from_job_str(jobstr): - raw = jobstr.split('-') - ps = {} - ps['batch_size'] = int(raw.pop()) - ps['latent_dim'] = int(raw.pop()) - ps['compression_factor'] = int(raw.pop()) - if len(raw) == 3: - ps['model_args'] = model_params_from_model_params_str(raw.pop()) - ps['model_name'] = raw.pop() - ps['dataset'] = raw.pop() - return types.SimpleNamespace(**ps) - -def find_done_params(out_dir): - ps = [] - for f in glob.glob(f'{out_dir}/*-shapeembed-score_df.csv'): - ps.append(params_from_job_str(os.path.basename(f)[:-24])) - return ps - def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): - jobname = shapeembed.job_str(ps) + jobname = job_str(ps) cmd = [ 'python3', shapeembed_script , '--wandb-project', wandb_project , '--output-dir', out_dir @@ -203,16 +178,17 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ clargs=parser.parse_args() # set verbosity level + logging.basicConfig() logger = logging.getLogger(__name__) - if clargs.verbose > 2: - logger.setLevel(logging.DEBUG) + if clargs.verbose > 1: + logger.setLevel('DEBUG') elif clargs.verbose > 0: - logger.setLevel(logging.INFO) 
+ logger.setLevel('INFO') os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - done_params = find_done_params(clargs.output_dir) + done_params = find_existing_run_scores(clargs.output_dir) all_params = gen_params_sweap_list() todo_params = [x for x in all_params if not params_match(x, done_params)] for ps in todo_params: From b74be39fc79fe068876527d44c06fe4576bd3fe0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 16:58:20 +0100 Subject: [PATCH 170/204] Add a comment/uncomment block for quick ad-hoc single config run --- scripts/shapeembed/slurm_sweap_shapeembed.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 6ebb81ac..0cbe042e 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -54,6 +54,16 @@ batch_sizes = [4, 8, 16] +# XXX XXX XXX XXX XXX XXX XXX # +# XXX ad-hoc one-off config XXX # +# XXX XXX XXX XXX XXX XXX XXX # +# uncomment the lines below for a quick overwrite of the parameter sweep +#datasets = [("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask")] +#models = ["resnet50_vae"] +#model_params = {} #{"resnet50_beta_vae": {'beta': [1]}} +#compression_factors = [10] +#batch_sizes = [16] + def gen_params_sweap_list(): p_sweap_list = [] for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) From 9b8e93463123cf895957bda084ae5e22a8c61963 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 16:59:40 +0100 Subject: [PATCH 171/204] added a function to find currently submitted slurm jobs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 0cbe042e..482afc5d 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -105,6 +105,10 @@ def check_model_args(a, b): break return found +def find_submitted_slurm_jobs(): + jobs = subprocess.run(['squeue', '--format', '%j'], stdout=subprocess.PIPE).stdout.decode('utf-8').split() + return list(map(params_from_job_str, jobs[1:])) + # other parameters ################################################################################ From 2cead3d94bae977b9ab6036dd38f94a35689df2a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 17:00:22 +0100 Subject: [PATCH 172/204] added clargs for job filtering enabling/disabling (enabled by default) --- scripts/shapeembed/slurm_sweap_shapeembed.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 482afc5d..0024f557 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -185,6 +185,14 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ '-o', '--output-dir', metavar='OUTPUT_DIR', default=dflt_out_dir , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument( + '--filter-done', action=argparse.BooleanOptionalAction, default=True + , help=f'filter out jobs with results (a *scores_df.csv) in OUTPUT_DIR') + + parser.add_argument( + '--filter-submitted', action=argparse.BooleanOptionalAction, default=True + , help=f'filter out jobs present in the current slurm `squeue`') + 
parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") @@ -203,7 +211,14 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ os.makedirs(clargs.output_dir, exist_ok=True) done_params = find_existing_run_scores(clargs.output_dir) + in_slurm_params = find_submitted_slurm_jobs() all_params = gen_params_sweap_list() - todo_params = [x for x in all_params if not params_match(x, done_params)] + + todo_params = all_params + if clargs.filter_done: + todo_params = [x for x in todo_params if not params_match(x, done_params)] + if clargs.filter_submitted: + todo_params = [x for x in todo_params if not params_match(x, in_slurm_params)] + for ps in todo_params: spawn_slurm_job(clargs.slurm_output_dir, clargs.output_dir, ps, logger=logger) From 650fcc17d5b2d7d5715ad2ddf2e653baad0470c8 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 17:04:39 +0100 Subject: [PATCH 173/204] typo fix: sweap -> sweep --- ...eap_shapeembed.py => slurm_sweep_shapeembed.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename scripts/shapeembed/{slurm_sweap_shapeembed.py => slurm_sweep_shapeembed.py} (96%) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py similarity index 96% rename from scripts/shapeembed/slurm_sweap_shapeembed.py rename to scripts/shapeembed/slurm_sweep_shapeembed.py index 0024f557..a9d89901 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -13,7 +13,7 @@ from common_helpers import * -# shapeembed parameters to sweap +# shapeembed parameters to sweep ################################################################################ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' @@ -64,8 +64,8 @@ #compression_factors = [10] #batch_sizes = [16] -def gen_params_sweap_list(): - p_sweap_list = [] +def gen_params_sweep_list(): + p_sweep_list = [] for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) , 'model_name': m , 'compression_factor': cf @@ -81,10 +81,10 @@ def gen_params_sweap_list(): for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: newparams = copy.deepcopy(params) newparams['model_args'] = types.SimpleNamespace(**ps) - p_sweap_list.append(types.SimpleNamespace(**newparams)) + p_sweep_list.append(types.SimpleNamespace(**newparams)) else: - p_sweap_list.append(types.SimpleNamespace(**params)) - return p_sweap_list + p_sweep_list.append(types.SimpleNamespace(**params)) + return p_sweep_list def params_match(x, ys): found = False @@ -212,7 +212,7 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ done_params = find_existing_run_scores(clargs.output_dir) in_slurm_params = find_submitted_slurm_jobs() - all_params = gen_params_sweap_list() + all_params = gen_params_sweep_list() todo_params = all_params if clargs.filter_done: From 62704af70eba4880bd586918150edbb88a88127e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:08:51 +0100 Subject: [PATCH 174/204] parse dataset as a SimpleNamespace from job string --- scripts/shapeembed/common_helpers.py | 2 +- scripts/shapeembed/slurm_sweep_shapeembed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py index 71a13847..204b4c09 100644 --- a/scripts/shapeembed/common_helpers.py +++ 
b/scripts/shapeembed/common_helpers.py @@ -26,7 +26,7 @@ def params_from_job_str(jobstr): for p in raw.pop().split('-'): if p[0:4] == 'beta': ps.model_args.beta = float(p[4:]) ps.model_name = raw.pop() - ps.dataset = raw.pop() + ps.dataset = types.SimpleNamespace(name=raw.pop()) return ps def find_existing_run_scores(dirname, logger=logging.getLogger(__name__)): diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index a9d89901..a15e85cd 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -95,7 +95,7 @@ def check_model_args(a, b): if a_yes and b_yes: return a.model_args == b.model_args return False for y in ys: - if x.dataset.name == y.dataset \ + if x.dataset.name == y.dataset.name \ and x.model_name == y.model_name \ and check_model_args(x, y) \ and x.compression_factor == y.compression_factor \ From 6e9ffcf2ca751ca23eadacfe0a04be98964180a5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:10:32 +0100 Subject: [PATCH 175/204] updated data gathering script to newer changes (still TODO for figures) --- scripts/shapeembed/gather_run_results.py | 220 ++++++++++++----------- 1 file changed, 116 insertions(+), 104 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1af719be..11410d39 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -1,12 +1,15 @@ #! /usr/bin/env python3 -import pandas as pd +import os +import shutil import logging import argparse -import shutil -import os +import datetime import functools - +import pandas as pd + +from common_helpers import * + # define a Custom aggregation # function for finding total def keep_first_fname(series): @@ -17,128 +20,136 @@ def get_run_info(run): return f'{x[0]}_{x[1]}', x[2], x[4] def main_process(clargs, logger=logging.getLogger(__name__)): - print(clargs) + + params = [] + for f in clargs.run_folders: + ps = find_existing_run_scores(f) + for p in ps: p.folder = f + params.append(ps) + params = [x for ps in params for x in ps] + logger.debug(params) + os.makedirs(clargs.output_dir, exist_ok=True) - dfs = [] - for d in clargs.run_folder: - csv = f'{d}/scores_df.csv' - #csv = f'{d}/scores_df_mean.csv' - if not os.path.isfile(csv): - print(f'WARNING: no {csv} found, skipping') - continue - - run_name = os.path.basename(d) - model, latent_space_sz, dataset = get_run_info(run_name) - df = pd.read_csv(csv) - df['model'] = model - df['latent_space_sz'] = latent_space_sz - df['dataset'] = dataset - - for trial in ['efd','regionprops','shapeembed', 'combined_all']: - - conf_mat = f'{trial}_confusion_matrix.png' - if os.path.isfile(f'{d}/{conf_mat}'): - shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') - df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' - else: - df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' - - umap = f'umap_{trial}.pdf' - if os.path.isfile(f'{d}/{umap}'): - shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') - df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' - else: - df.loc[df['trial'] == trial, 'umap'] = f'nofile' - - barplot = f'scores_barplot.pdf' - if os.path.isfile(f'{d}/{barplot}'): - shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') - df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' - else: - df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + dfs = [] + for p 
in params: + + # open scores dataframe + df = pd.read_csv(p.csv_file, index_col=0) + + # pair up with confusion matrix + conf_mat_file = f'{job_str(p)}-shapeembed-confusion_matrix.png' + print(f'{p.folder}/{conf_mat_file}') + if os.path.isfile(f'{p.folder}/{conf_mat_file}'): + shutil.copy(f'{p.folder}/{conf_mat_file}',f'{clargs.output_dir}/{conf_mat_file}') + df['conf_mat'] = f'./{conf_mat_file}' + else: + df['conf_mat'] = f'nofile' + + # pair up with umap + umap_file = f'{job_str(p)}-shapeembed-umap.pdf' + if os.path.isfile(f'{p.folder}/{umap_file}'): + shutil.copy(f'{p.folder}/{umap_file}',f'{clargs.output_dir}/{umap_file}') + df['umap'] = f'./{umap_file}' + else: + df['umap'] = f'nofile' + + ## pair up with barplot + #barplot = f'scores_barplot.pdf' + #if os.path.isfile(f'{d}/{barplot}'): + # shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') + # df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' + #else: + # df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + + # add dataframe to list for future concatenation dfs.append(df.convert_dtypes()) + # gather all dataframes together df = pd.concat(dfs) - df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df - df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) + logger.debug(df) + df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) + + #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df + df.set_index(['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) df.sort_index(inplace=True) - df = df.groupby(level=['dataset', 'trial', 'model', 'latent_space_sz']).agg({ + df = df.groupby(level=['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname - , 'barplot': keep_first_fname + #, 'barplot': keep_first_fname }) print('-'*80) print(df) print('-'*80) - - - cell_hover = { # for row hover use instead of - 'selector': 'td:hover', - 'props': [('background-color', '#ffffb3')] - } - index_names = { - 'selector': '.index_name', - 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' - } - headers = { - 'selector': 'th:not(.index_name)', - 'props': 'background-color: #eeeeee; color: #333333;' - } - - def html_img(path): - if os.path.splitext(path)[1][1:] == 'png': - return f'' - if os.path.splitext(path)[1][1:] == 'pdf': - return f'' - return '
:(
' - df['conf_mat'] = df['conf_mat'].apply(html_img) - df['umap'] = df['umap'].apply(html_img) - df['barplot'] = df['barplot'].apply(html_img) - - def render_html(fname, d): - with open(fname, 'w') as f: - f.write(''' - - - - ''') - s = d.style - s.set_table_styles([cell_hover, index_names, headers]) - s.to_html(f, classes='df') - f.write('') - - with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: - f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') - df.to_latex(f) - f.write('\\end{decument}') - render_html(f'{clargs.output_dir}/gathered_table.html', df) - - dft = df.transpose() - with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: - f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') - dft.to_latex(f) - f.write('\\end{decument}') - render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) + df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') + + + #cell_hover = { # for row hover use instead of + # 'selector': 'td:hover', + # 'props': [('background-color', '#ffffb3')] + # } + #index_names = { + # 'selector': '.index_name', + # 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' + # } + #headers = { + # 'selector': 'th:not(.index_name)', + # 'props': 'background-color: #eeeeee; color: #333333;' + # } + + #def html_img(path): + # if os.path.splitext(path)[1][1:] == 'png': + # return f'' + # if os.path.splitext(path)[1][1:] == 'pdf': + # return f'' + # return '
:(
' + #df['conf_mat'] = df['conf_mat'].apply(html_img) + #df['umap'] = df['umap'].apply(html_img) + #df['barplot'] = df['barplot'].apply(html_img) + + #def render_html(fname, d): + # with open(fname, 'w') as f: + # f.write(''' + # + # + # + # ''') + # s = d.style + # s.set_table_styles([cell_hover, index_names, headers]) + # s.to_html(f, classes='df') + # f.write('') + + #with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: + # f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + # df.to_latex(f) + # f.write('\\end{decument}') + #render_html(f'{clargs.output_dir}/gathered_table.html', df) + + #dft = df.transpose() + #with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: + # f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + # dft.to_latex(f) + # f.write('\\end{decument}') + #render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run the shape embed pipeline') - parser.add_argument( 'run_folder', nargs="+", type=str + parser.add_argument( 'run_folders', metavar='run_folder', nargs="+", type=str , help=f"The runs folders to gather results from") parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR' - , default=f'{os.getcwd()}/gathered_results' + , default=f'{os.getcwd()}/gathered_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' , help=f"The OUTPUT_DIR path to use to gather results") parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") @@ -147,10 +158,11 @@ def render_html(fname, d): clargs=parser.parse_args() # set verbosity level + logging.basicConfig() logger = logging.getLogger(__name__) - if clargs.verbose > 2: - logger.setLevel(logging.DEBUG) + if clargs.verbose > 1: + logger.setLevel('DEBUG') elif clargs.verbose > 0: - logger.setLevel(logging.INFO) + logger.setLevel('INFO') main_process(clargs, logger) From ffce0d38677c84fe5e47fca85e4e61a4329647e9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:26:48 +0100 Subject: [PATCH 176/204] removed stale script string --- scripts/shapeembed/slurm_sweep_shapeembed.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index a15e85cd..cc022bed 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -122,15 +122,6 @@ def find_submitted_slurm_jobs(): shapeembed_script=f'{os.getcwd()}/shapeembed.py' wandb_project='shapeembed' -slurm_script="""#! 
/bin/bash -echo "running shape embed with:" -echo " - dataset {dataset[0]} ({dataset[1]}, {dataset[2]})" -echo " - model {model} ({model_params})" -echo " - compression_factor {compression_factor}" -echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} -""" - ################################################################################ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): From d802f9a751855da13568e1866afd87f677125bb6 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 23:36:21 +0100 Subject: [PATCH 177/204] Split model name in two columns if there are model args --- scripts/shapeembed/gather_run_results.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 11410d39..9a78f003 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -37,6 +37,13 @@ def main_process(clargs, logger=logging.getLogger(__name__)): # open scores dataframe df = pd.read_csv(p.csv_file, index_col=0) + # split model column in case model args are present + model_cols = df['model'].str.split('-', n=1, expand=True) + if model_cols.shape[1] == 2: + df = df.drop('model', axis=1) + df.insert(1, 'model_args', model_cols[1]) + df.insert(1, 'model', model_cols[0]) + # pair up with confusion matrix conf_mat_file = f'{job_str(p)}-shapeembed-confusion_matrix.png' print(f'{p.folder}/{conf_mat_file}') From 663cc52f1e02fbe575409dc94400b4aa0beab776 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 00:25:03 +0100 Subject: [PATCH 178/204] remove stale import --- scripts/shapeembed/evaluation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 3f3452d8..26f344b7 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -18,8 +18,6 @@ import seaborn import matplotlib.pyplot as plt -from bioimage_embed.shapes.transforms import ImageToCoords - # logging facilities ############################################################################### logger = logging.getLogger(__name__) From 7d328d9102bc765492362e57b6c8c7183e53d313 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 00:25:38 +0100 Subject: [PATCH 179/204] experiment with plots --- scripts/shapeembed/evaluation.py | 48 ++++++++++++++++++++++++ scripts/shapeembed/gather_run_results.py | 20 +++++----- 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 26f344b7..bd71cb17 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -184,3 +184,51 @@ def save_scores( scores_df .xs("test_f1", level="Metric", drop_level=False) .groupby("trial") .mean()) + +def save_barplot( scores_df + , outputdir='.' 
+ , width = 7 + , height = 7 / 1.2 ): + # save a barplot representation of scores + melted_df = scores_df[['model', 'beta', 'compression_factor', 'batch_size', 'test_f1']].melt( + id_vars=['model', 'beta', 'compression_factor', 'batch_size'] + , var_name="Metric" + , value_name="Score" + ) + for m in melted_df['model'].unique(): + for cf in melted_df['compression_factor'].unique(): + if 'beta' in m: + for bs in melted_df['batch_size'].unique(): + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['compression_factor'] == cf) & (melted_df['batch_size'] == bs) + , ['beta', 'Metric', 'Score'] ] + , kind="bar" + , x='beta' + , hue="Metric" + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against beta ({m}, compression factor {cf}, batch size {bs})') + plt.savefig(f"{outputdir}/beta_barplot_{m}_{cf}_{bs}.pdf") + plt.close() + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['compression_factor'] == cf) + , ['batch_size', 'beta', 'Metric', 'Score'] ] + , kind="bar" + , x='batch_size' + , hue='beta' if 'beta' in m else 'Metric' + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + plt.savefig(f"{outputdir}/barplot_{m}_{cf}.pdf") + plt.close() + # log info + #logger.info(melted_df.set_index(["trial", "Metric"]) + # .xs("test_f1", level="Metric", drop_level=False) + # .groupby("trial") + # .mean()) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 9a78f003..1c9b33e0 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -9,15 +9,7 @@ import pandas as pd from common_helpers import * - -# define a Custom aggregation -# function for finding total -def keep_first_fname(series): - return functools.reduce(lambda x, y: y if x == 'nofile' else y, series) - -def get_run_info(run): - x = run.split('_') - return f'{x[0]}_{x[1]}', x[2], x[4] +from evaluation import * def main_process(clargs, logger=logging.getLogger(__name__)): @@ -76,11 +68,17 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df = pd.concat(dfs) logger.debug(df) df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) + save_barplot(df, clargs.output_dir) #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df - df.set_index(['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) + # define a Custom aggregation + # function for finding total + def keep_first_fname(series): + return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) + df.set_index(['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) df.sort_index(inplace=True) - df = df.groupby(level=['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' From e3796ba8684b4115180360d99bf4320a750d79da Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 
08:42:41 +0100 Subject: [PATCH 180/204] keep exploring potential plots --- scripts/shapeembed/evaluation.py | 42 ++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index bd71cb17..96c399c8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -190,12 +190,50 @@ def save_barplot( scores_df , width = 7 , height = 7 / 1.2 ): # save a barplot representation of scores - melted_df = scores_df[['model', 'beta', 'compression_factor', 'batch_size', 'test_f1']].melt( - id_vars=['model', 'beta', 'compression_factor', 'batch_size'] + melted_df = scores_df[['model', 'beta', 'compression_factor', 'latent_dim', 'batch_size', 'test_f1']].melt( + id_vars=['model', 'beta', 'compression_factor', 'latent_dim', 'batch_size'] , var_name="Metric" , value_name="Score" ) + # test plots... for m in melted_df['model'].unique(): + # 1 - general overview plot... + df = melted_df.loc[ (melted_df['model'] == m) + , ['compression_factor', 'latent_dim', 'batch_size', 'beta', 'Metric', 'Score'] ].sort_values(by=['compression_factor', 'latent_dim', 'batch_size', 'beta']) + hue = df[['compression_factor', 'latent_dim']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim})', axis=1) + if 'beta' in m: + hue = df[['compression_factor', 'latent_dim', 'beta']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim}), beta: {r.beta}', axis=1) + ax = seaborn.catplot( data=df + , kind="bar" + , x='batch_size' + , y="Score" + , hue=hue + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + #ax.tick_params(axis='x', rotation=90) + #ax.set(xlabel=None) + #ax.set(xticklabels=[]) + ax._legend.remove() + #ax.fig.legend(loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=3) + #ax.fig.legend(ncol=4, loc='lower center') + ax.fig.legend(ncol=1) + #ax.fig.subplots_adjust(top=0.9) + #ax.set(title=f'f1 score against batch size ({m})') + + #add overall title + plt.title(f'f1 score against batch size ({m})', fontsize=16) + + ##add axis titles + #plt.xlabel('') + #plt.ylabel('') + + #rotate x-axis labels + #plt.xticks(rotation=45) + + plt.savefig(f"{outputdir}/barplot_{m}.pdf", bbox_inches="tight") + plt.close() + # 2 - more specific plots for cf in melted_df['compression_factor'].unique(): if 'beta' in m: for bs in melted_df['batch_size'].unique(): From cecde4e12997981a056384eb967a5a099390c684 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 10:08:35 +0100 Subject: [PATCH 181/204] more graphs --- scripts/shapeembed/evaluation.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 96c399c8..1a872eeb 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -231,8 +231,46 @@ def save_barplot( scores_df #rotate x-axis labels #plt.xticks(rotation=45) - plt.savefig(f"{outputdir}/barplot_{m}.pdf", bbox_inches="tight") + plt.savefig(f"{outputdir}/barplot_{m}_x_bs.pdf", bbox_inches="tight") plt.close() + + # 1b - general overview plot... 
+ df = melted_df.loc[ (melted_df['model'] == m) + , ['batch_size', 'compression_factor', 'latent_dim', 'beta', 'Metric', 'Score'] ].sort_values(by=['batch_size', 'compression_factor', 'latent_dim', 'beta']) + hue = df['batch_size'].apply(lambda r: f'bs: {r}') + if 'beta' in m: + hue = df[['batch_size', 'beta']].apply(lambda r: f'bs: {r.batch_size}, beta: {r.beta}', axis=1) + ax = seaborn.catplot( data=df + , kind="bar" + , x=df[['compression_factor', 'latent_dim']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim})', axis=1) + , y="Score" + , hue=hue + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + #ax.tick_params(axis='x', rotation=90) + #ax.set(xlabel=None) + #ax.set(xticklabels=[]) + ax._legend.remove() + #ax.fig.legend(loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=3) + #ax.fig.legend(ncol=4, loc='lower center') + ax.fig.legend(ncol=1) + #ax.fig.subplots_adjust(top=0.9) + #ax.set(title=f'f1 score against batch size ({m})') + + #add overall title + plt.title(f'f1 score against compression factor (latent space size) ({m})', fontsize=16) + + ##add axis titles + #plt.xlabel('') + #plt.ylabel('') + + #rotate x-axis labels + #plt.xticks(rotation=45) + + plt.savefig(f"{outputdir}/barplot_{m}_x_cf.pdf", bbox_inches="tight") + plt.close() + # 2 - more specific plots for cf in melted_df['compression_factor'].unique(): if 'beta' in m: @@ -263,7 +301,21 @@ def save_barplot( scores_df ax.tick_params(axis='x', rotation=90) ax.fig.subplots_adjust(top=0.9) ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') - plt.savefig(f"{outputdir}/barplot_{m}_{cf}.pdf") + plt.savefig(f"{outputdir}/barplot_{m}_x_bs_cf{cf}.pdf") + plt.close() + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == cf) + , ['compression_factor', 'beta', 'Metric', 'Score'] ] + , kind="bar" + , x='compression_factor' + , hue='beta' if 'beta' in m else 'Metric' + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + plt.savefig(f"{outputdir}/barplot_{m}_x_cf_bs{bs}.pdf") plt.close() # log info #logger.info(melted_df.set_index(["trial", "Metric"]) From 344cff131aee49594f3cf2f61ad729f7a00a6756 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 20:12:45 +0100 Subject: [PATCH 182/204] fix model name in shapeembed output csv --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 744f29fa..b027a3c9 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -104,7 +104,7 @@ def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) - cols.append(('model', model_str(params))) + cols.append(('model', params.model_name)) for k, v in vars(params.model_args).items(): cols.append((k, v)) cols.append(('compression_factor', params.compression_factor)) cols.append(('latent_dim', params.latent_dim)) From 6846ce15f278e5238d57b85d08f6d7ee3b912a69 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 21:28:09 +0100 Subject: [PATCH 183/204] Added loss / mse to shapeembed's generated csv --- bioimage_embed/lightning/torch.py | 26 ++++++++++++++------------ scripts/shapeembed/shapeembed.py | 5 ++++- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git 
a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index ab730c3f..22147b81 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -49,6 +49,8 @@ def __init__(self, model, args=SimpleNamespace()): # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) self.save_hyperparameters(vars(self.args)) # self.model.train() + # keep a handle on metrics logged by the model + self.metrics = {} def forward(self, batch): x = self.batch_to_tensor(batch) @@ -118,12 +120,12 @@ def validation_step(self, batch, batch_idx): x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output(x, batch_idx) z = self.embedding_from_output(model_output) - self.log_dict( - { - "loss/val": loss, - "mse/val": F.mse_loss(model_output.recon_x, x["data"]), - } - ) + val_metrics ={ + "loss/val": loss, + "mse/val": F.mse_loss(model_output.recon_x, x["data"]), + } + self.log_dict( val_metrics,) + self.metrics = {**self.metrics, **val_metrics} return loss # def lr_scheduler_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None): @@ -171,12 +173,12 @@ def test_step(self, batch, batch_idx): loss = self.loss_function(model_output) # Log test metrics - self.log_dict( - { - "loss/test": loss, - "mse/test": F.mse_loss(model_output.recon_x, x["data"]), - } - ) + test_metrics = { + "loss/test": loss, + "mse/test": F.mse_loss(model_output.recon_x, x["data"]), + } + self.log_dict(test_metrics) + self.metrics = {**self.metrics, **test_metrics} return loss diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index b027a3c9..e62dbd59 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -344,6 +344,9 @@ def main_process(params): trainer, model, dataloader , num_workers=params.num_workers ) + + # gather and log stats + ###################### logger.debug(f'\n{shapeembed_df}') pfx=job_str(params) np.save(f'{params.output_dir}/{pfx}-shapeembed-latent_space.npy', latent_space) @@ -352,7 +355,7 @@ def main_process(params): logger.info(f'-- generate shapeembed umap --') umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') - shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)) + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()]) logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') From 7a8972fcc90b5d67005f67c009c4ef41c766ede9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Jul 2024 19:22:13 +0100 Subject: [PATCH 184/204] updated slurm script with regex filtering of squeue output --- scripts/shapeembed/common_helpers.py | 4 ++++ scripts/shapeembed/slurm_sweep_shapeembed.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py index 204b4c09..fd09a241 100644 --- a/scripts/shapeembed/common_helpers.py +++ b/scripts/shapeembed/common_helpers.py @@ -1,3 +1,4 @@ +import re import os import glob import types @@ -15,6 +16,9 @@ def model_str(params): def job_str(params): return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" +def job_str_re(): + return 
re.compile("(.*)-(.*)-(\d+)-(\d+)-(\d+)") + def params_from_job_str(jobstr): raw = jobstr.split('-') ps = types.SimpleNamespace() diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index cc022bed..7d4aa40c 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -18,9 +18,10 @@ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' datasets = [ - ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") +# ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") -# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") + ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") @@ -28,14 +29,14 @@ ] models = [ - "resnet18_vae" + "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vae" , "resnet50_vae" , "resnet18_beta_vae" , "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" -, "resnet18_vqvae" -, "resnet50_vqvae" #, "resnet18_vqvae_legacy" #, "resnet50_vqvae_legacy" #, "resnet101_vqvae_legacy" @@ -46,8 +47,8 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [1,2,5,10,20]} -, "resnet50_beta_vae": {'beta': [1,2,5,10,20]} + "resnet18_beta_vae": {'beta': [2,5]} +, "resnet50_beta_vae": {'beta': [2,5]} } compression_factors = [1,2,3,5,10] @@ -107,7 +108,7 @@ def check_model_args(a, b): def find_submitted_slurm_jobs(): jobs = subprocess.run(['squeue', '--format', '%j'], stdout=subprocess.PIPE).stdout.decode('utf-8').split() - return list(map(params_from_job_str, jobs[1:])) + return list(map(params_from_job_str, filter(lambda x: x, map(job_str_re().match, jobs[1:])))) # other parameters ################################################################################ From d9c87a3f7eca8ea59e29dd5edb456b55739b3eee Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Jul 2024 23:43:04 +0100 Subject: [PATCH 185/204] added a simple latex table to the gather_run_results script --- scripts/shapeembed/gather_run_results.py | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1c9b33e0..db968aa0 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -1,6 +1,7 @@ #! 
/usr/bin/env python3 import os +import re import shutil import logging import argparse @@ -11,6 +12,21 @@ from common_helpers import * from evaluation import * +def simple_table(df, tname, model_re=".*vq.*"): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1'] + df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) + df = df.sort_values(by='test_f1', ascending=False).iloc[:10] + + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|llll|r|} \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score \\\\ \hline\n") + for _, r in df.iterrows(): + mname = r['model'].replace('_','\_') + beta = '-' if pd.isna(r['beta']) else r['beta'] + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:.4f} \\\\\n") + fp.write("\hline\n") + fp.write("\end{tabular}\n") + def main_process(clargs, logger=logging.getLogger(__name__)): params = [] @@ -75,11 +91,13 @@ def main_process(clargs, logger=logging.getLogger(__name__)): # function for finding total def keep_first_fname(series): return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) - df.set_index(['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) + idx_cols = ['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'] + df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - 'test_accuracy': 'mean' + df = df.groupby(level=idx_cols).agg({ + 'beta': 'mean' + , 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' @@ -93,6 +111,8 @@ def keep_first_fname(series): print('-'*80) df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') + simple_table(df.reset_index(), f'{clargs.output_dir}/simple_table') + #cell_hover = { # for row hover use instead of # 'selector': 'td:hover', From cced4e9090791be6c2178952329b06e60ee2eb73 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:28:32 +0100 Subject: [PATCH 186/204] minor refactor in efd --- scripts/shapeembed/efd.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 4f910990..28e8aa0b 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -7,20 +7,27 @@ import argparse # own imports +#import bioimage_embed # necessary for the datamodule class to make sure we get the same test set +from bioimage_embed.shapes.transforms import ImageToCoords from evaluation import * -def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): +def get_dataset(dataset_params): # access the dataset - assert dataset_params.type == 'mask' - ds = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) - # ... 
and run efd on each image + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' + dataset = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + return dataset + #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) + #dataloader.setup() + #return dataloader.test + +def run_elliptic_fourier_descriptors(dataset, contour_size, logger): + # run efd on each image dfs = [] - logger.info(f'running efd on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + logger.info(f'running efd on {dataset}') + for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) norm_coeffs = pyefd.normalize_efd(coeffs) df = pandas.DataFrame({ @@ -73,7 +80,7 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): # efd on input data and score - efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) + efd_df = run_elliptic_fourier_descriptors(get_dataset(dataset), contour_size, logger) logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-raw_df.csv") From 3e6e4c9f2e0d401f2687b90c0545e5334fd94b84 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:28:41 +0100 Subject: [PATCH 187/204] minor refactor in regionprops --- scripts/shapeembed/regionprops.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 3b65933f..db37ac25 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -7,19 +7,25 @@ from skimage import measure # own imports +#import bioimage_embed # necessary for the datamodule class to make sure we get the same test set from evaluation import * -def run_regionprops( dataset_params - , properties - , logger ): +def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) - # ... 
and run regionprops for the given properties for each image + dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + return dataset + #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) + #dataloader.setup() + #return dataloader.test + +def run_regionprops( dataset + , properties + , logger ): + # run regionprops for the given properties for each image dfs = [] - logger.info(f'running regionprops on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + logger.info(f'running regionprops on {dataset}') + for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): data = numpy.where(numpy.array(img)>20, 255, 0) t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) @@ -75,7 +81,7 @@ def run_regionprops( dataset_params # regionprops on input data and score - regionprops_df = run_regionprops(dataset, properties, logger) + regionprops_df = run_regionprops(get_dataset(dataset), properties, logger) logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-raw_df.csv") From 2908c155e713a2e37000fe26d736c6e7d816e128 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:29:19 +0100 Subject: [PATCH 188/204] generated plots and more tables in gather_run_results --- scripts/shapeembed/gather_run_results.py | 70 +++++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index db968aa0..e354e55b 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -4,6 +4,7 @@ import re import shutil import logging +import seaborn import argparse import datetime import functools @@ -12,18 +13,44 @@ from common_helpers import * from evaluation import * -def simple_table(df, tname, model_re=".*vq.*"): - cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1'] +#def simple_table(df, tname, model_re=".*vq.*"): +def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, best_n=40): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) - df = df.sort_values(by='test_f1', ascending=False).iloc[:10] + if sort_by_col: + df = df.sort_values(by=sort_by_col, ascending=ascending) + df = df.iloc[:best_n] with open(f'{tname}_tabular.tex', 'w') as fp: - fp.write("\\begin{tabular}{|llll|r|} \hline\n") - fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score \\\\ \hline\n") + fp.write("\\begin{tabular}{|llll|r|r|} \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & Mse \\\\ \hline\n") for _, r in df.iterrows(): mname = r['model'].replace('_','\_') beta = '-' if pd.isna(r['beta']) else r['beta'] - fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:.4f} \\\\\n") + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['mse/test']:f} \\\\\n") + fp.write("\hline\n") + fp.write("\end{tabular}\n") + +def compare_f1_mse_table(df, tname, best_n=40): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] + df0 = df[cols].sort_values(by=cols) + df0 = df0.sort_values(by='test_f1', 
ascending=False) + df0 = df0.iloc[:best_n] + df1 = df[cols].sort_values(by=cols) + df1 = df1.sort_values(by='mse/test', ascending=True) + df1 = df1.iloc[:best_n] + df = pd.concat([df0.reset_index(), df1.reset_index()], axis=1, keys=['f1', 'mse']) + print(df) + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|llll|r|r|llll|r|r|} \hline\n") + fp.write("\multicolumn{6}{|l}{Best F1 score} & \multicolumn{6}{|l|}{Best Mse} \\\\\n") + fp.write("Model & CF (latent space) & batch size & BETA & F1 score & Mse & Model & CF (latent space) & batch size & BETA & F1 score & Mse \\\\ \hline\n") + for _, r in df.iterrows(): + f1_name = r[('f1', 'model')].replace('_','\_') + mse_name = r[('mse', 'model')].replace('_','\_') + f1_beta = '-' if pd.isna(r[('f1', 'beta')]) else r[('f1', 'beta')] + mse_beta = '-' if pd.isna(r[('mse', 'beta')]) else r[('mse', 'beta')] + fp.write(f"{f1_name} & {r[('f1', 'compression_factor')]} ({r[('f1', 'latent_dim')]}) & {r[('f1', 'batch_size')]} & {f1_beta} & {r[('f1', 'test_f1')]:f} & {r[('f1', 'mse/test')]:f} & {mse_name} & {r[('mse', 'compression_factor')]} ({r[('mse', 'latent_dim')]}) & {r[('mse', 'batch_size')]} & {mse_beta} & {r[('mse', 'test_f1')]:f} & {r[('mse', 'mse/test')]:f} \\\\\n") fp.write("\hline\n") fp.write("\end{tabular}\n") @@ -101,6 +128,10 @@ def keep_first_fname(series): , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' + , 'mse/test': 'mean' + , 'loss/test': 'mean' + , 'mse/val': 'mean' + , 'loss/val': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname #, 'barplot': keep_first_fname @@ -110,9 +141,30 @@ def keep_first_fname(series): print(df) print('-'*80) df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') - - simple_table(df.reset_index(), f'{clargs.output_dir}/simple_table') - + df = df.reset_index() + + # table results for f1 and mse comparison + simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') + simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) + compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) + + # mse / f1 plots + dff=df[df['mse/test'] instead of # 'selector': 'td:hover', From 41d460343d39d98787b5bd9e0c38b762d3ee4b18 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 23:38:56 +0100 Subject: [PATCH 189/204] added regionprops and efd to gather results script --- scripts/shapeembed/gather_run_results.py | 34 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index e354e55b..3b5c7e32 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -13,6 +13,17 @@ from common_helpers import * from evaluation import * +def trial_table(df, tname): + best_model = df.dropna(subset=['model']).sort_values(by='test_f1', ascending=False).iloc[0] + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|l|r|} \hline\n") + fp.write("Trial & F1 score \\\\ \hline\n") + name = best_model['trial'].replace('_','\_') + fp.write(f"{name} & {best_model['test_f1']} \\\\ \hline\n") + fp.write(f"regionprops & {df[df['trial'] == 'regionprops'].iloc[0]['test_f1']} \\\\ \hline\n") + fp.write(f"efd & {df[df['trial'] == 'efd'].iloc[0]['test_f1']} \\\\ \hline\n") + fp.write("\end{tabular}\n") + #def simple_table(df, tname, model_re=".*vq.*"): def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, 
best_n=40): cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] @@ -56,6 +67,16 @@ def compare_f1_mse_table(df, tname, best_n=40): def main_process(clargs, logger=logging.getLogger(__name__)): + dfs = [] + + # regionprops / efd + for dirname in clargs.run_folders: + for f in glob.glob(f'{dirname}/*-regionprops-score_df.csv'): + dfs.append(pd.read_csv(f, index_col=0)) + for f in glob.glob(f'{dirname}/*-efd-score_df.csv'): + dfs.append(pd.read_csv(f, index_col=0)) + + # shapeembed params = [] for f in clargs.run_folders: ps = find_existing_run_scores(f) @@ -66,7 +87,6 @@ def main_process(clargs, logger=logging.getLogger(__name__)): os.makedirs(clargs.output_dir, exist_ok=True) - dfs = [] for p in params: # open scores dataframe @@ -111,18 +131,18 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df = pd.concat(dfs) logger.debug(df) df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) - save_barplot(df, clargs.output_dir) + save_barplot(df.dropna(subset=['model']), clargs.output_dir) #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df # define a Custom aggregation # function for finding total def keep_first_fname(series): - return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) + return functools.reduce(lambda x, y: y if str(x) == 'nofile' else x, series) idx_cols = ['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'] df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - df = df.groupby(level=idx_cols).agg({ + df = df.groupby(level=idx_cols, dropna=False).agg({ 'beta': 'mean' , 'test_accuracy': 'mean' , 'test_precision': 'mean' @@ -147,6 +167,7 @@ def keep_first_fname(series): simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) + trial_table(df, f'{clargs.output_dir}/trials') # mse / f1 plots dff=df[df['mse/test'] Date: Mon, 29 Jul 2024 21:10:59 +0100 Subject: [PATCH 190/204] Updated graphs titles --- scripts/shapeembed/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 1a872eeb..d530e9f6 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -303,7 +303,8 @@ def save_barplot( scores_df ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') plt.savefig(f"{outputdir}/barplot_{m}_x_bs_cf{cf}.pdf") plt.close() - ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == cf) + for bs in melted_df['batch_size'].unique(): + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == bs) , ['compression_factor', 'beta', 'Metric', 'Score'] ] , kind="bar" , x='compression_factor' @@ -314,7 +315,7 @@ def save_barplot( scores_df , aspect=width * 2**0.5 / height ) ax.tick_params(axis='x', rotation=90) ax.fig.subplots_adjust(top=0.9) - ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + ax.set(title=f'f1 score against compression factor ({m}, compression batch size {bs})') plt.savefig(f"{outputdir}/barplot_{m}_x_cf_bs{bs}.pdf") plt.close() # log info From 932b13a7bcc926cdde63168783ec45afe91952cb Mon 
Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Jul 2024 21:13:43 +0100 Subject: [PATCH 191/204] fake beta column if necessary and filter out regionprops and efd for f1 Vs Mse comparison --- scripts/shapeembed/gather_run_results.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 3b5c7e32..b58e5fd1 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -116,6 +116,10 @@ def main_process(clargs, logger=logging.getLogger(__name__)): else: df['umap'] = f'nofile' + # NA desired columns if not already present + if 'beta' not in df.keys(): + df['beta'] = pd.NA + ## pair up with barplot #barplot = f'scores_barplot.pdf' #if os.path.isfile(f'{d}/{barplot}'): @@ -166,8 +170,13 @@ def keep_first_fname(series): # table results for f1 and mse comparison simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) - compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) - trial_table(df, f'{clargs.output_dir}/trials') + # temporarily drop regionprops and efd rows for F1 and MSE comparison + dff = df[(df['trial'] != 'regionprops') & (df['trial'] != 'efd')] + compare_f1_mse_table(dff, f'{clargs.output_dir}/table_top5_compare', best_n=5) + if 'regionprops' in df['trial'].values and 'efd' in df['trial'].values: + trial_table(df, f'{clargs.output_dir}/trials') + else: + logger.info('skipped trial table comparison (need both regionprops and efd results)') # mse / f1 plots dff=df[df['mse/test'] Date: Mon, 29 Jul 2024 21:24:09 +0100 Subject: [PATCH 192/204] updated datasets + only find jobs and scores if corresponding filter active --- scripts/shapeembed/slurm_sweep_shapeembed.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 7d4aa40c..1080394c 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -20,8 +20,10 @@ datasets = [ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") - ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") -, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") +# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +# ("vampire_cells", f"{datasets_pfx}/vampire_cells/", "mask") + ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") +#, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") @@ -202,14 +204,13 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - done_params = find_existing_run_scores(clargs.output_dir) - in_slurm_params = find_submitted_slurm_jobs() - all_params = gen_params_sweep_list() + todo_params = gen_params_sweep_list() - todo_params = all_params if clargs.filter_done: + done_params = find_existing_run_scores(clargs.output_dir) todo_params = [x for x in todo_params if not 
params_match(x, done_params)] if clargs.filter_submitted: + in_slurm_params = find_submitted_slurm_jobs() todo_params = [x for x in todo_params if not params_match(x, in_slurm_params)] for ps in todo_params:
From 583e835fcdf3ae637df792f2a7ac531201ec358d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Jul 2024 21:54:19 +0100 Subject: [PATCH 193/204] bugfix overwriting loop dataframe --- scripts/shapeembed/gather_run_results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index b58e5fd1..478a4c73 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -187,15 +187,15 @@ def keep_first_fname(series): dff = df.dropna(subset=['model']) for m in dff['model'].unique(): - dff = dff[dff['model']==m] + local_df = dff[dff['model']==m] print(m) - ax = seaborn.relplot(kind='line', data=dff.dropna(subset=['test_f1']), x='compression_factor', y='test_f1', hue='batch_size') + ax = seaborn.relplot(kind='line', data=local_df.dropna(subset=['test_f1']), x='compression_factor', y='test_f1', hue='batch_size') ax.figure.suptitle(f'{m}: f1 VS compression factor') ax.figure.savefig(f'{clargs.output_dir}/{m}_f1VScompression_factor_line.png') - ax = seaborn.relplot(kind='line', data=dff.dropna(subset=['mse/test']), x='compression_factor', y='mse/test', hue='batch_size') + ax = seaborn.relplot(kind='line', data=local_df.dropna(subset=['mse/test']), x='compression_factor', y='mse/test', hue='batch_size') ax.figure.suptitle(f'{m}: Mse VS compression factor') ax.figure.savefig(f'{clargs.output_dir}/{m}_mseVScompression_factor_line.png') - simple_table(dff, f'{clargs.output_dir}/{m}_summary_table') + simple_table(local_df, f'{clargs.output_dir}/{m}_summary_table') #cell_hover = { # for row hover use instead of # 'selector': 'td:hover',
From 07bb89c57ff4a266b0a67b229bdf025c3fb6ea8f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 30 Jul 2024 10:41:50 +0100 Subject: [PATCH 194/204] Added a clarg to control regionprops properties --- scripts/shapeembed/regionprops.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index db37ac25..07431864 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -24,7 +24,7 @@ def run_regionprops( dataset , logger ): # run regionprops for the given properties for each image dfs = [] - logger.info(f'running regionprops on {dataset}') + logger.info(f'running regionprops on {dataset}, properties: {properties}') for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): data = numpy.where(numpy.array(img)>20, 255, 0) t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) @@ -53,6 +53,10 @@ def run_regionprops( dataset , "minor_axis_length" , "orientation" ] + parser.add_argument( + '-p', '--properties', metavar='PROP', default=dflt_properties, nargs='+' + , help=f"Overwrite the list of properties to consider (default: {dflt_properties})") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' , help=f"The OUTPUT_DIR path to use to dump results") @@ -74,7 +78,7 @@ def run_regionprops( dataset dataset = types.SimpleNamespace( name=clargs.dataset[0] , path=clargs.dataset[1] , type=clargs.dataset[2] ) - properties = dflt_properties + properties = clargs.properties # create output dir if it does not exist os.makedirs(clargs.output_dir, exist_ok=True) From
a35490c65e45a7fdd41dbc35593b04d5a76df054 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 8 Aug 2024 11:25:11 +0100 Subject: [PATCH 195/204] Added random order to efd and regionprops --- scripts/shapeembed/efd.py | 14 +++++++------- scripts/shapeembed/regionprops.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 28e8aa0b..9b9525f8 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -3,6 +3,7 @@ import os import types import pyefd +import random import logging import argparse @@ -14,14 +15,13 @@ def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - dataset = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) + raw_dataset = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + dataset = [x for x in raw_dataset] + random.shuffle(dataset) return dataset - #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) - #dataloader.setup() - #return dataloader.test def run_elliptic_fourier_descriptors(dataset, contour_size, logger): # run efd on each image diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 07431864..a2325c86 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -2,6 +2,7 @@ import os import types +import random import logging import argparse from skimage import measure @@ -13,11 +14,10 @@ def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + raw_dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + dataset = [x for x in raw_dataset] + random.shuffle(dataset) return dataset - #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) - #dataloader.setup() - #return dataloader.test def run_regionprops( dataset , properties From 83a7679bb893928be9d46e5eb5a60ac38edfb4ce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 7 Sep 2024 20:35:52 +0100 Subject: [PATCH 196/204] force different markers for scatter plot F1vMSE --- scripts/shapeembed/gather_run_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 478a4c73..1d2ca37e 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -182,7 +182,7 @@ def keep_first_fname(series): dff=df[df['mse/test'] Date: Mon, 9 Sep 2024 18:35:27 +0100 Subject: [PATCH 197/204] updated scatterplot --- scripts/shapeembed/gather_run_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1d2ca37e..35098ebd 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -182,7 +182,9 @@ def keep_first_fname(series): dff=df[df['mse/test'] Date: Fri, 27 Sep 2024 13:42:39 +0100 Subject: [PATCH 198/204] add standard deviation to the report for regions props and efd --- scripts/shapeembed/gather_run_results.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 35098ebd..b14ffb58 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -26,7 +26,7 @@ def trial_table(df, tname): #def simple_table(df, tname, model_re=".*vq.*"): def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, best_n=40): - cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'test_f1_std', 'mse/test'] df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) if sort_by_col: df = df.sort_values(by=sort_by_col, ascending=ascending) @@ -34,11 +34,11 @@ def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, be with open(f'{tname}_tabular.tex', 'w') as fp: fp.write("\\begin{tabular}{|llll|r|r|} \hline\n") - fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & Mse \\\\ \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & F1 score (std) & Mse \\\\ \hline\n") for _, r in df.iterrows(): mname = r['model'].replace('_','\_') beta = '-' if pd.isna(r['beta']) else r['beta'] - fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['mse/test']:f} \\\\\n") + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['test_f1_std']:f} & {r['mse/test']:f} \\\\\n") fp.write("\hline\n") fp.write("\end{tabular}\n") @@ -146,12 +146,14 @@ def keep_first_fname(series): df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + df['test_f1_std'] = df['test_f1'].astype(float) df = df.groupby(level=idx_cols, dropna=False).agg({ 'beta': 'mean' , 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' + , 'test_f1_std': 'std' , 'mse/test': 'mean' , 'loss/test': 'mean' , 'mse/val': 'mean' From c0232c7c36bd883941cc609a5e795cea894267b5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 27 Sep 2024 13:48:29 +0100 Subject: [PATCH 199/204] modification slurm script --- scripts/shapeembed/slurm_sweep_shapeembed.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 1080394c..7a94fda2 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -21,8 +21,8 @@ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") # ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") -# ("vampire_cells", f"{datasets_pfx}/vampire_cells/", "mask") - ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") + ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") +# ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") #, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") @@ -32,11 +32,11 @@ models = [ "resnet18_vqvae" -, "resnet50_vqvae" -, "resnet18_vae" -, "resnet50_vae" +#, "resnet50_vqvae" +#, 
"resnet18_vae" +#, "resnet50_vae" , "resnet18_beta_vae" -, "resnet50_beta_vae" +#, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" #, "resnet18_vqvae_legacy" @@ -49,8 +49,10 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [2,5]} -, "resnet50_beta_vae": {'beta': [2,5]} + #"resnet18_beta_vae": {'beta': [2,5]} + "resnet18_beta_vae": {'beta': [0.0001]} +#, "resnet50_beta_vae": {'beta': [2,5]} +, "resnet50_beta_vae": {'beta': [0.00001]} } compression_factors = [1,2,3,5,10] @@ -119,7 +121,7 @@ def find_submitted_slurm_jobs(): dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' slurm_time = '50:00:00' -slurm_mem = '250G' +slurm_mem = '80G' slurm_gpus = 'a100:1' shapeembed_script=f'{os.getcwd()}/shapeembed.py' From 870aa4211d522a7069b4b1ed0963026fa5ed6cde Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 29 Sep 2024 18:26:17 +0100 Subject: [PATCH 200/204] changes to test o2vae integration XXX relies on an adapted o2vae repo present in bioimage_embed/modles/o2vae --- bioimage_embed/models/factory.py | 67 ++++++++++++++++++- scripts/shapeembed/dataset_transformations.py | 66 ++++++++++++++++++ scripts/shapeembed/shapeembed.py | 25 ++++++- 3 files changed, 154 insertions(+), 4 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 8c6440d5..4c5f1a21 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -18,7 +18,6 @@ from . import bolts from functools import partial - class ModelFactory: def __init__( self, input_dim, latent_dim, pretrained=False, progress=True, **kwargs @@ -200,6 +199,71 @@ def resnet110_vqvae_legacy(self): def resnet152_vqvae_legacy(self): return self.resnet_vqvae_legacy(152) + def o2vae(self): + from .o2vae.models.decoders.cnn_decoder import CnnDecoder + from .o2vae.models.encoders_o2.e2scnn import E2SFCNN + from .o2vae.models.vae import VAE as O2VAE + + # encoder + q_net = E2SFCNN( + n_channels = 1, + n_classes = 64 * 2, # bc vae saves mean and stdDev vecors + # `name`: 'o2_cnn' for o2-invariant encoder. 'cnn_encoder' for standard cnn encoder. + name="o2_cnn_encoder", + # `cnn_dims`: must be 6 elements long. Increase numbers for larger model capacity + cnn_dims=[6, 9, 12, 12, 19, 25], + # `layer_type`: type of cnn layer (following e2cnn library examples) + layer_type="inducedgated_norm", # recommend not changing + # `N`: Ignored if `name!='o2'`. Negative means the model will be O2-invariant. + # Again, see (e2cnn library examples). Recommend not changing. + N=-3, + ) + + # decoder + p_net = CnnDecoder( + zdim = 64, + name="cnn_decoder", # 'cnn' is the ony option + # `cnn_dims`: each extra layer doubles the dimension (image width) by a factor of 2. + # E.g. if there are 6 elements, image width is 2^6=64 + cnn_dims=[192, 96, 96, 48, 48, 48], + #cnn_dims=[192, 96, 96, 48, 48, 24, 24, 12, 12], + out_channels=1, + ) + + # vae + model = O2VAE( + q_net = q_net, + p_net = p_net, + zdim = 64, # vae bottleneck layer + do_sigmoid = True, # whether to make the output be between [0,1]. Usually True. + loss_kwargs = dict( + # 'beta' from beta-vae, or the weight on the KL-divergence term https://openreview.net/forum?id=Sy2fzU9gl + beta=0.01, + # `recon_loss_type`: "bce" (binary cross entropy) or "mse" (mean square error) + # or "ce" (cross-entropy, but warning, not been tested well) + #recon_loss_type="bce", + recon_loss_type="mse", + # for reconstrutcion loss, pixel mask. Must be either `None` or an array with same dimension as the images. 
+ mask=None, + align_loss=True, # whether to align the output image to the input image + # whether to use efficient Foureier-based loss alignment. (Ignored if align_loss==False) + align_fourier=True, + # whether to do align the best rotation AND flip, instead of just rotation. (Ignored if align_loss==False) + do_flip=True, + # if doing brute force align loss, this is the rotation discretization. (Ignored if + # align_loss==False or if align_fourier==True) + rot_steps=2, + # Recommend not changing. The vae prior distribution. Optoins: ("standard","normal","gmm"). See models.vae.VAE for deatils. + prior_kwargs=dict( prior="standard",), + ) + ) + + # extra attributes + model.encoder = q_net + model.decoder = p_net + + return model + MODELS = [ "resnet18_vae", @@ -217,6 +281,7 @@ def resnet152_vqvae_legacy(self): "resnet152_vqvae_legacy", "resnet18_vae_legacy", "resnet50_vae_legacy", + "o2vae", ] from typing import Tuple diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py index 1cd76c7f..ad3789c7 100644 --- a/scripts/shapeembed/dataset_transformations.py +++ b/scripts/shapeembed/dataset_transformations.py @@ -146,3 +146,69 @@ def mask2distmatrix(mask, matrix_size=512, raw_sampling_sparsity=1): dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) logger.debug(f'mask2distmatrix: created distance matrix shape {dm.shape}') return dm + +def bbox(img): + """ + This function returns the bounding box of the content of an image, where + "content" is any non 0-valued pixel. The bounding box is returned as the + quadruple ymin, ymax, xmin, xmax. + + Parameters + ---------- + img : 2-d numpy array + An image with an object to find the bounding box for. The truth value of + object pixels should be True and of non-object pixels should be False. + + Returns + ------- + ymin: int + The lowest index row containing object pixels + ymax: int + The highest index row containing object pixels + xmin: int + The lowest index column containing object pixels + xmax: int + The highest index column containing object pixels + """ + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + ymin, ymax = np.where(rows)[0][[0, -1]] + xmin, xmax = np.where(cols)[0][[0, -1]] + return ymin, ymax, xmin, xmax + +def recrop_image(img, square=False): + """ + This function returns an image recroped to its content. + + Parameters + ---------- + img : 3-d numpy array + A 3-channels (rgb) 2-d image with an object to recrop around. The value of + object pixels should be non-zero (and zero for non-object pixels). 
+ + Returns + ------- + 3-d numpy array + The recroped image + """ + + ymin, ymax, xmin, xmax = bbox(img) + newimg = img[ymin:ymax+1, xmin:xmax+1] + + if square: # slot the new image into a black square + dx, dy = xmax - xmin + 1, ymax - ymin + 1 + dmax = max(dx, dy) + dmin = min(dx, dy) + dd = max(dx, dy) - min(dx, dy) + off = dd // 2 + res = np.full((dmax, dmax, 3), [.0,.0,.0]) # big black square + if dx < dy: # fewer columns, center horizontally + res[:, off+1:off+1+newimg.shape[1]] = newimg + else: # fewer lines, center vertically + #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") + #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") + #print(f"DEBUG: newimg.shape: {newimg.shape}") + res[off+1:off+1+newimg.shape[0],:] = newimg + return res + else: + return newimg diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index e62dbd59..9f18a7c0 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -60,6 +60,7 @@ , "resnet152_vqvae_legacy" , "resnet18_vae_legacy" , "resnet50_vae_legacy" +, "o2vae" ] # set of parameters for a run, with default values @@ -165,12 +166,30 @@ def get_dataloader(params): if params.dataset.type == 'raw_image': # TODO raise NotImplementedError("raw images not yet supported") elif params.dataset.type == 'mask': # mask data, convert to distance matrix first + #dataset = datasets.ImageFolder( + # params.dataset.path + #, transforms.Compose([ np.array + # , functools.partial( mask2distmatrix + # , matrix_size=params.distance_matrix_size ) + # , distmat_ts ])) + def f(x): + print(f"DEBUG: shape:{x.shape}") + return x + def g(x): + print(f"-------------") + return x dataset = datasets.ImageFolder( params.dataset.path , transforms.Compose([ np.array - , functools.partial( mask2distmatrix - , matrix_size=params.distance_matrix_size ) - , distmat_ts ])) + , functools.partial(recrop_image, square=True) + , torch.as_tensor + , lambda x: torch.transpose(x, 0, 2) + , transforms.Resize(64) + , lambda x: torch.transpose(x, 0, 2) + , rgb2grey + #, lambda x: x.repeat(3, 1, 1) + , lambda x: x.repeat(1, 1, 1) + ])) elif params.dataset.type == 'distance_matrix': # distance matrix data dataset = datasets.DatasetFolder( params.dataset.path , loader=np.load From 6fa5cc903f518e2bd8a766734bf332416ec84448 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 29 Sep 2024 22:28:50 +0100 Subject: [PATCH 201/204] off-by-one in square recrop --- scripts/shapeembed/dataset_transformations.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py index ad3789c7..8c4c6693 100644 --- a/scripts/shapeembed/dataset_transformations.py +++ b/scripts/shapeembed/dataset_transformations.py @@ -196,19 +196,21 @@ def recrop_image(img, square=False): newimg = img[ymin:ymax+1, xmin:xmax+1] if square: # slot the new image into a black square - dx, dy = xmax - xmin + 1, ymax - ymin + 1 + dx, dy = xmax+1 - xmin, ymax+1 - ymin dmax = max(dx, dy) - dmin = min(dx, dy) + #dmin = min(dx, dy) dd = max(dx, dy) - min(dx, dy) off = dd // 2 res = np.full((dmax, dmax, 3), [.0,.0,.0]) # big black square + #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") + #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") + #print(f"DEBUG: newimg.shape: {newimg.shape}") if dx < dy: # fewer columns, center horizontally - res[:, 
off+1:off+1+newimg.shape[1]] = newimg + res[:, off:off+newimg.shape[1]] = newimg else: # fewer lines, center vertically - #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") - #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") - #print(f"DEBUG: newimg.shape: {newimg.shape}") - res[off+1:off+1+newimg.shape[0],:] = newimg + res[off:off+newimg.shape[0],:] = newimg + #print(f"DEBUG: res img updated") + #print(f"DEBUG: ------------------------------") return res else: return newimg From 85ce853ad6fb2a47ee17246716cabb3b4f3c1958 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 08:26:40 +0100 Subject: [PATCH 202/204] added drop_last for uneven dataset sizes --- bioimage_embed/lightning/dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bioimage_embed/lightning/dataloader.py b/bioimage_embed/lightning/dataloader.py index 29f608a4..34b84097 100644 --- a/bioimage_embed/lightning/dataloader.py +++ b/bioimage_embed/lightning/dataloader.py @@ -35,6 +35,7 @@ def __init__( "pin_memory": True, "shuffle": False, "sampler": sampler, + "drop_last": True, # "collate_fn": self.collate_wrapper(self.collate_filter_for_none), # "collate_fn": self.collate_filter_for_none, } From db3a4fb975f6c0a4cc33b31c3ecbd50f08336345 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 17:20:51 +0100 Subject: [PATCH 203/204] specialized slurm script --- scripts/shapeembed/slurm_sweep_shapeembed.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 7a94fda2..d04a5d5f 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -21,21 +21,22 @@ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") # ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") - ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") -# ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") -#, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") -#, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") -#, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") -#, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") -#, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") + ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") +, ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") +, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") +, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") +, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") +, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") ] models = [ - "resnet18_vqvae" + "o2vae" +# "resnet18_vqvae" #, "resnet50_vqvae" #, "resnet18_vae" #, "resnet50_vae" -, "resnet18_beta_vae" +#, "resnet18_beta_vae" #, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" @@ -50,14 +51,15 @@ model_params = { #"resnet18_beta_vae": {'beta': [2,5]} - "resnet18_beta_vae": {'beta': [0.0001]} +# "resnet18_beta_vae": {'beta': [0.0001]} #, "resnet50_beta_vae": {'beta': [2,5]} -, "resnet50_beta_vae": {'beta': [0.00001]} +#, "resnet50_beta_vae": {'beta': [0.00001]} } -compression_factors = 
[1,2,3,5,10] +#compression_factors = [1,2,3,5,10] +compression_factors = [1] -batch_sizes = [4, 8, 16] +batch_sizes = [4, 16, 64, 128, 256] # XXX XXX XXX XXX XXX XXX XXX # # XXX ad-hoc one-off config XXX # From c45b884273216eef610fd8b2dca4094dae5bb176 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 17:21:56 +0100 Subject: [PATCH 204/204] added o2vae repo patch --- .../models/o2vae_shapeembed_integration.diff | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 bioimage_embed/models/o2vae_shapeembed_integration.diff diff --git a/bioimage_embed/models/o2vae_shapeembed_integration.diff b/bioimage_embed/models/o2vae_shapeembed_integration.diff new file mode 100644 index 00000000..309d7206 --- /dev/null +++ b/bioimage_embed/models/o2vae_shapeembed_integration.diff @@ -0,0 +1,97 @@ +diff --git a/models/align_reconstructions.py b/models/align_reconstructions.py +index d07d1ab..c52b40d 100644 +--- a/models/align_reconstructions.py ++++ b/models/align_reconstructions.py +@@ -6,7 +6,7 @@ import torch + import torchgeometry as tgm + import torchvision.transforms.functional as T_f + +-from registration import registration ++from ..registration import registration + + + def loss_reconstruction_fourier_batch(x, y, recon_loss_type="bce", mask=None): +diff --git a/models/decoders/cnn_decoder.py b/models/decoders/cnn_decoder.py +index ba3a1cc..1740945 100644 +--- a/models/decoders/cnn_decoder.py ++++ b/models/decoders/cnn_decoder.py +@@ -58,7 +58,7 @@ class CnnDecoder(nn.Module): + + self.dec_conv = nn.Sequential(*layers) + +- def forward(self, x): ++ def forward(self, x, epoch = None): + bs = x.size(0) + x = self.fc(x) + dim = x.size(1) +diff --git a/models/encoders_o2/e2scnn.py b/models/encoders_o2/e2scnn.py +index 9c4f47f..e292b1e 100644 +--- a/models/encoders_o2/e2scnn.py ++++ b/models/encoders_o2/e2scnn.py +@@ -219,14 +219,20 @@ class E2SFCNN(torch.nn.Module): + repr += f"\t{i: <3} - {name: <70} | {params: <8} |\n" + return repr + +- def forward(self, input: torch.tensor): ++ def forward(self, input: torch.tensor, epoch = None): ++ #print(f"DEBUG: e2scnn forward: input.shape: {input.shape}") + x = GeometricTensor(input, self.in_repr) ++ #print(f"DEBUG: e2scnn forward: pre layers x.shape: {x.shape}") + + for layer in self.eq_layers: + x = layer(x) + ++ #print(f"DEBUG: e2scnn forward: pre fully_net x.shape: {x.shape}") ++ + x = self.fully_net(x.tensor.reshape(x.tensor.shape[0], -1)) + ++ #print(f"DEBUG: e2scnn forward: pre final x.shape: {x.shape}") ++ + return x + + def build_layer_regular( +diff --git a/models/vae.py b/models/vae.py +index 3af262b..af1a2dc 100644 +--- a/models/vae.py ++++ b/models/vae.py +@@ -3,8 +3,9 @@ import importlib + import numpy as np + import torch + import torchvision ++from pythae.models.base.base_utils import ModelOutput + +-from models import align_reconstructions ++from . import align_reconstructions + + from . import model_utils as mut + +@@ -273,10 +274,11 @@ class VAE(torch.nn.Module): + + return y + +- def forward(self, x): ++ def forward(self, x, epoch = None): ++ x = x["data"] + in_shape = x.shape + bs = in_shape[0] +- assert x.ndim == 4 ++ assert len(in_shape) == 4 + + # inference and sample + z = self.q_net(x) +@@ -290,8 +292,12 @@ class VAE(torch.nn.Module): + y = torch.sigmoid(y) + # check the spatial dimensions are good (if doing multiclass prediction per pixel, the `c` dim may be different) + assert in_shape[-2:] == y.shape[-2:], ( +- "output image different dimension to " +- "input image ... 
probably change the number of layers (cnn_dims) in the decoder" ++ f"output image different dimension {y.shape[-2:]} to " ++ f"input image {in_shape[-2:]} ... probably change the number of layers (cnn_dims) in the decoder" + ) + +- return x, y, mu, logvar ++ # gather losses ++ losses = self.loss(x, y, mu, logvar) ++ ++ return ModelOutput(recon_x=y, z=z_sample, loss=losses['loss'], recon_loss=losses['loss_recon']) ++ #return ModelOutput(recon_x=y, z=z_sample)
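
A note on PATCH 201/204 ("off-by-one in square recrop"), for readers skimming the diff: with the inclusive bounding box used by recrop_image, starting the paste at off+1 biases the crop towards one edge of the square, and when the crop is already square (dx == dy) the slice off+1:off+1+shape is one row short, so the assignment fails. Starting at off centres the crop and always fits. The following self-contained sketch (not part of the patch series) shows the corrected centring; recrop_to_square is a hypothetical helper standing in for bbox plus recrop_image, and it assumes background pixels are exactly zero.

import numpy as np

def recrop_to_square(img: np.ndarray) -> np.ndarray:
    # Crop an (H, W, 3) image to its non-zero bounding box, then centre it on a black square.
    ys, xs = np.nonzero(img.any(axis=-1))    # foreground pixels; background assumed to be 0
    crop = img[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
    dy, dx = crop.shape[:2]
    side = max(dy, dx)
    off = (side - min(dy, dx)) // 2          # symmetric padding, as in the patched code
    res = np.zeros((side, side, 3), dtype=img.dtype)
    if dx < dy:                              # fewer columns: centre horizontally
        res[:, off:off + dx] = crop          # start at off, not off + 1
    else:                                    # fewer (or equal) rows: centre vertically
        res[off:off + dy, :] = crop
    return res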
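
On PATCH 202/204: when the dataset size is not a multiple of the batch size, the last batch comes out smaller than the rest, and some training-mode layers (for instance BatchNorm when the leftover batch holds a single sample) or fixed-shape code paths will not accept it; drop_last=True simply discards that short final batch. A small illustration, independent of the bioimage_embed dataloader (the toy dataset below is made up):

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.randn(10, 3))   # 10 samples with batch_size=4 -> batches of 4, 4, 2
print([x.shape[0] for (x,) in DataLoader(ds, batch_size=4)])                  # [4, 4, 2]
print([x.shape[0] for (x,) in DataLoader(ds, batch_size=4, drop_last=True)])  # [4, 4]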
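
PATCH 204/204 ships the o2vae integration as a patch file against the external o2vae repository rather than as code in this tree: it converts o2vae's absolute imports to package-relative ones, lets the encoder and decoder forward methods accept an optional epoch argument, and rewrites VAE.forward so that it unpacks the {"data": ...} batches used by the pythae/bioimage_embed training loop and returns a pythae ModelOutput carrying the reconstruction, the latent sample and the losses. A stripped-down adapter in the same spirit is sketched below; PythaeForwardAdapter and the (recon, z, losses) interface of the wrapped model are illustrative assumptions, not the actual integration.

import torch.nn as nn
from pythae.models.base.base_utils import ModelOutput

class PythaeForwardAdapter(nn.Module):
    # Wraps a model whose forward takes a plain tensor and returns (recon, z, loss dict)
    # so that it can be driven with pythae-style {"data": tensor} batches.
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, batch, epoch=None):
        x = batch["data"]                    # pythae hands the model a dict-like batch
        recon, z, losses = self.inner(x)     # assumed interface of the wrapped model
        return ModelOutput(recon_x=recon, z=z,
                           loss=losses["loss"], recon_loss=losses["loss_recon"])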