From 7b43109dc5dcc5a0471b60be8c1c41c25c50f6f8 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 15 Jan 2024 15:07:24 +0000 Subject: [PATCH 001/204] Removing umap feature compression --- scripts/shapes/shape_embed.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 0c4efec4..7eb1160e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -342,30 +342,15 @@ def shape_embed_process(): # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) - idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} - y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) y_partial = y.copy() indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) y_partial[indices] = -1 y_blind = -1 * np.ones_like(y) - umap_labels = y_blind - classes = np.array([idx_to_class[i] for i in y]) - - n_components = 64 # Number of UMAP components - component_names = [f"umap{i}" for i in range(n_components)] # List of column names - - logger.info("UMAP fitting") - mapper = umap.UMAP(n_components=64, random_state=42).fit( - latent_space.numpy(), y=umap_labels - ) - - logger.info("UMAP transforming") - semi_supervised_latent = mapper.transform(latent_space.numpy()) - - df = pd.DataFrame(semi_supervised_latent, columns=component_names) + + df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels idx_to_class = {0: "alive", 1: "dead"} From b17827a40f6e0af43a7c520bf99374f80955e72a Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Fri, 5 Jan 2024 18:50:29 +0000 Subject: [PATCH 002/204] Indexation augmentation (forgot this wasnt in here) --- bioimage_embed/shapes/transforms.py | 26 ++++++++++++++++++++++++-- scripts/shapes/shape_embed.py | 7 ++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 33535871..504b71cc 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -159,8 +159,13 @@ def __repr__(self): def get_distogram(self, coords, matrix_normalised=False): xii, yii = coords - distance_matrix = euclidean_distances(np.array([xii, yii]).T) - # Fro norm is the same as the L2 norm, but for positive semi-definite matrices + distance_matrix = euclidean_distances(np.array([xii, yii]).T) / ( + np.sqrt(2) * self.size + ) + # TODO size should be shape of matrix and the normalisation should be + # D / (np.linalg.norm(x.shape[-2:])) + + norm = np.linalg.norm(distance_matrix, "fro") if matrix_normalised: return distance_matrix / np.linalg.norm(distance_matrix, "fro") if not matrix_normalised: @@ -365,3 +370,20 @@ def asym_dist_to_sym_dist(self, asymm_dist): sym_dist = np.max(dist_stack, axis=0) return torch.tensor(np.array(sym_dist)) + + +class RotateIndexingClockwise(nn.Module): + def __init__(self, max_rotations=None, p=1.0): + super(RotateIndexingClockwise, self).__init__() + self.max_rotations = max_rotations + self.probability = p + + def forward(self, img): + if np.random.rand() < self.probability: + if self.max_rotations is None: + self.max_rotations = img.shape[0] + num_rotations = np.random.randint(0, self.max_rotations) + img = np.roll( + img.numpy(), shift=[num_rotations, num_rotations], axis=[0, 1] + ) + return torch.from_numpy(img) diff --git a/scripts/shapes/shape_embed.py 
b/scripts/shapes/shape_embed.py index b6834cac..49f9dff9 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -241,7 +241,12 @@ def shape_embed_process(): # %% gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) transform = transforms.Compose( - [transform_mask_to_dist, transforms.ToTensor(), gray2rgb] + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] ) dataset = datasets.ImageFolder(train_data_path, transform=transform) From 5cf77fc9bd88553dd23df8cdb67cc81e774c504d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 8 Jan 2024 09:54:11 +0000 Subject: [PATCH 003/204] Fixed the import issue --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49f9dff9..df8f26fd 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -41,6 +41,7 @@ CropCentroidPipeline, DistogramToCoords, MaskToDistogramPipeline, + RotateIndexingClockwise, ) import matplotlib.pyplot as plt From 9273f8cfd161a36daf9466ad7718c8a502c059ac Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Mon, 8 Jan 2024 13:51:23 +0000 Subject: [PATCH 004/204] missing import --- bioimage_embed/shapes/transforms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 504b71cc..1d350a04 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -11,6 +11,7 @@ from sklearn.metrics.pairwise import euclidean_distances from skimage.measure import find_contours import torch +from torch import nn import torch.nn.functional as F from . import contours From 5f46b74ba775e4ab53e053dcdfcf8003025d40f4 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 17 Jan 2024 13:29:14 +0000 Subject: [PATCH 005/204] Fixing tests --- bioimage_embed/tests/test_lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioimage_embed/tests/test_lightning.py b/bioimage_embed/tests/test_lightning.py index e1e5dc4a..a02ed2ca 100644 --- a/bioimage_embed/tests/test_lightning.py +++ b/bioimage_embed/tests/test_lightning.py @@ -109,7 +109,7 @@ def data(input_dim): @pytest.fixture() def dataset(data): - return data.unsqueeze(0) + return data @pytest.fixture() From 0fc9066d1a4c4b3075af79bf7337bf4e976594fc Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:16:38 +0000 Subject: [PATCH 006/204] First attempt at setting up the testing cicd --- .github/workflows/docker.yaml | 60 ++++++++++----------- .github/workflows/test.yaml | 56 ++++++++++---------- Makefile | 98 ----------------------------------- environment.yml | 32 ++++++------ 4 files changed, 69 insertions(+), 177 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index fa528792..d5c5392e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -2,8 +2,8 @@ name: Publish Docker on: push: branches: - - main - - master + - main + - master # pull_request: ~ env: @@ -14,37 +14,29 @@ jobs: build: runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3.3.0 - with: - fetch-depth: 2 - - name: Log in to the Container registry - uses: docker/login-action@v2.1.0 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout + uses: actions/checkout@v3.3.0 + with: + fetch-depth: 2 + - name: Log in to the Container registry + 
if: ${{ !env.ACT }} + uses: docker/login-action@v2.1.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v4.3.0 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4.3.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - - name: Build and push Docker image (version tag) - if: steps.check-version.outputs.current-version - uses: docker/build-push-action@v3.3.0 - with: - context: . - push: true - tags: ghcr.io/${{ github.repository }}:${{ steps.check-version.outputs.current-version }} - labels: ${{ steps.meta.outputs.labels }} - - - name: Build and push Docker image (latest tag) - if: steps.check-version.outputs.current-version - uses: docker/build-push-action@v3.3.0 - with: - context: . - push: true - tags: ghcr.io/${{ github.repository }}:latest - labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file + - name: Build and push Docker image (version tag) + if: steps.check-version.outputs.current-version + uses: docker/build-push-action@v3.3.0 + with: + context: . + push: true + tags: ghcr.io/${{ github.repository }}:${{ steps.check-version.outputs.current-version }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f3d9f5f..cc5c6b31 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,36 +1,34 @@ -# https://github.com/marketplace/actions/install-poetry-action -name: test - -on: [pull_request,push] - +name: conda +on: [push] jobs: - test: + constructor: + name: conda build (${{ matrix.python-version }}, ${{ matrix.os }}) + runs-on: ${{ matrix.os }}-latest defaults: run: - shell: bash -l {0} + shell: ${{ matrix.shell }} strategy: - fail-fast: false matrix: + os: [ubuntu] python-version: ["3.9"] - os: [ubuntu-latest] - # os: [ubuntu-18.04, macos-latest, windows-latest] - runs-on: ${{ matrix.os }} + include: + - os: ubuntu + shell: bash -l {0} + - os: windows + shell: cmd /C call {0} + - os: macos + shell: bash -l {0} steps: - - name: Check out repository - uses: actions/checkout@v2 - - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - use-mamba: true - environment-file: environment.yml - python-version: ${{ matrix.python-version }} - - name: poetry env - run: poetry env use python - - name: Poetry lock - run: poetry lock - - name: Install library - run: poetry install --no-interaction - # - name: Run tests - # run: | - # source .venv/bin/activate - # pytest tests/ \ No newline at end of file + - uses: actions/checkout@v2 + - uses: conda-incubator/setup-miniconda@v2 + with: + environment-file: environment.yml + miniforge-variant: Mambaforge + miniforge-version: latest + mamba-version: "*" + use-mamba: true + python-version: ${{ matrix.python-version }} + - name: Run tests + run: | + source .venv/bin/activate + make test diff --git a/Makefile b/Makefile index 1f1fb42a..08d73569 100644 --- a/Makefile +++ b/Makefile @@ -9,101 +9,3 @@ download.data: test: pytest - -GOOGLE_APPLICATION_CREDENTIALS=$(shell pwd)/credentials.json -BUCKET_NAME=idr-hipsci -TRAINING_DIR=idr0034-kilpinen-hipsci -PROJECT=prj-ext-dev-bia-binder-113155 - -JOB_PREFIX=vae -JOB_NAME=$(JOB_PREFIX)_$(shell date +%Y%m%d_%H%M%S) -JOB_DIR=gs://${BUCKET_NAME}/${JOB_NAME}/models -DATA_DIR=gs://${BUCKET_NAME}/${TRAINING_DIR} - 
-.EXPORT_ALL_VARIABLES: - GOOGLE_APPLICATION_CREDENTIALS - BUCKET_NAME - TRAINING_DIR - JOB_PREFIX - JOB_NAME - JOB_DIR - - -# MY_VAR := $(shell echo whatever) - -# test: -# @echo MY_VAR IS $(MY_VAR) - -test: - @echo $$GOOGLE_APPLICATION_CREDENTIALS $$BUCKET_NAME $$TRAINING_DIR - -all: get_data_list build - -build: - conda activate torch - python idr_get_data.py - -get_data_list: - ls /nfs/bioimage/drop/idr*/**/*.tiff > file_list.txt - ls -u /nfs/bioimage/drop/idr*/**/*.tiff > file_list.txt - -run.on.cloud: - python idr_get_data_s3.py - -run.on.cloud.snake: - snakemake --use-conda --cores all \ - --verbose --google-lifesciences \ - --default-remote-prefix idr-hipsci \ - --google-lifesciences-region eu-west2 - -run.snake: - snakemake --cores all -F --use-conda --verbose - -get.env.file: - conda env export --from-history -f environment.yml -n torch - -on.gcp: - gcloud ai-platform jobs submit training ${JOB_NAME} \ - --region=europe-west2 \ - --master-image-uri=gcr.io/cloud-ml-public/training/pytorch-gpu.1-9 \ - --scale-tier=CUSTOM \ - --master-machine-type=n1-standard-8 \ - --master-accelerator=type=nvidia-tesla-t4,count=1 \ - --job-dir=${JOB_DIR} \ - --package-path=./trainer \ - --module-name=trainer.train \ - --stream-logs \ - -- \ - --num-epochs=10 \ - --batch-size=100 \ - --learning-rate=0.001 \ - --gpus=1 - - -on.gcp.big: - gcloud ai-platform jobs submit training ${JOB_NAME} \ - --region=europe-west2 \ - --master-image-uri=gcr.io/cloud-ml-public/training/pytorch-gpu.1-9 \ - --config=config.yaml \ - --job-dir=${JOB_DIR} \ - --package-path=./trainer \ - --module-name=trainer.train \ - --stream-logs \ - -- \ - --num-epochs=10 \ - --batch-size=100 \ - --learning-rate=0.001 \ - --gpus=2 \ - --accelerator='ddp'\ - --num_nodes=3 - -tensorboard: - tensorboard --logdir=gs://$(BUCKET_NAME)/${JOB_NAME} -download.data: - kaggle competitions download -c data-science-bowl-2018 - -test: - pytest - -download.idr: - rsync -avR --progress ctr26@noah-login:/nfs/bioimage/drop/idr0093-mueller-perturbation/ data/idr diff --git a/environment.yml b/environment.yml index 32343b75..e1887e27 100644 --- a/environment.yml +++ b/environment.yml @@ -1,19 +1,19 @@ # name: bioimage_embed channels: - - conda-forge - - defaults - - torch - - bioconda +- conda-forge +- defaults +- torch +- bioconda dependencies: - - cudatoolkit-dev=10 - - python=3.9 - - mamba - - poetry - - gcc - - libgcc - - pytorch - - pillow=9.5.0 - - snakemake-minimal - - pip - - pip: - - -e . +- cudatoolkit-dev=10 +- python=3.9 +- mamba +- poetry +- gcc +- libgcc +- pytorch +- pillow=9.5.0 +- snakemake-minimal +- pip +- pip: + - -e . From 8a2d14d77f9a54e58766e3c253ca81dc46ff5bc2 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:18:32 +0000 Subject: [PATCH 007/204] adding windows back in? 
--- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cc5c6b31..1bdc01fd 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -9,7 +9,7 @@ jobs: shell: ${{ matrix.shell }} strategy: matrix: - os: [ubuntu] + os: [ubuntu, windows, macos] python-version: ["3.9"] include: - os: ubuntu From 74f2d13cd0f12f38403fa6f1ad30383f64978c3a Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:19:00 +0000 Subject: [PATCH 008/204] commented instead I think this makes more sense --- .github/workflows/test.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1bdc01fd..e0e9d468 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -9,15 +9,16 @@ jobs: shell: ${{ matrix.shell }} strategy: matrix: - os: [ubuntu, windows, macos] + # os: [ubuntu, windows, macos] + os: [ubuntu] python-version: ["3.9"] include: - os: ubuntu shell: bash -l {0} - - os: windows - shell: cmd /C call {0} - - os: macos - shell: bash -l {0} + # - os: windows + # shell: cmd /C call {0} + # - os: macos + # shell: bash -l {0} steps: - uses: actions/checkout@v2 - uses: conda-incubator/setup-miniconda@v2 From c190e04e7160fc8d047617ecabd91675774965bb Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:19:40 +0000 Subject: [PATCH 009/204] removing snakemake from env --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index e1887e27..568d68ce 100644 --- a/environment.yml +++ b/environment.yml @@ -13,7 +13,6 @@ dependencies: - libgcc - pytorch - pillow=9.5.0 -- snakemake-minimal - pip - pip: - -e . 
From 1e657fdac2d5a2e912a32d33dda8232e5d86c005 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 16 Jan 2024 14:26:20 +0000 Subject: [PATCH 010/204] Forgot to remove sourceing --- .github/workflows/test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e0e9d468..dbf6e63d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -31,5 +31,4 @@ jobs: python-version: ${{ matrix.python-version }} - name: Run tests run: | - source .venv/bin/activate make test From 510aa042dcc2c5ae67bada8d830cccc64e6c8c73 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:35:40 +0000 Subject: [PATCH 011/204] Generalised the logging a bit --- bioimage_embed/lightning/torch.py | 73 +++++++++---------- bioimage_embed/models/pythae/legacy/vq_vae.py | 6 +- scripts/shapes/shape_embed.py | 5 +- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index e9eef522..53d649fe 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -7,6 +7,7 @@ import argparse import timm from pythae.models.base.base_utils import ModelOutput +import torch.nn.functional as F class LitAutoEncoderTorch(pl.LightningModule): @@ -45,8 +46,8 @@ def __init__(self, model, args=SimpleNamespace()): if args: self.args = SimpleNamespace(**{**vars(args), **vars(self.args)}) # if kwargs: - # merged_kwargs = {k: v for d in kwargs.values() for k, v in d.items()} - # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) + # merged_kwargs = {k: v for d in kwargs.values() for k, v in d.items()} + # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) self.save_hyperparameters(vars(self.args)) # self.model.train() @@ -72,31 +73,24 @@ def get_model_output(self, x, batch_idx): return model_output, loss def training_step(self, batch, batch_idx): - # results = self.get_results(batch) self.model.train() x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output( x, batch_idx, ) - # loss = self.model.training_step(x) - # loss = self.loss_function(model_output,optimizer_idx) - - # self.log("train_loss", self.loss) - # self.log("train_loss", loss) - self.logger.experiment.add_scalar("Loss/train", loss, batch_idx) - - self.logger.experiment.add_image( - "input", torchvision.utils.make_grid(x["data"]), batch_idx - ) - - # if self.PYTHAE_FLAG: - self.logger.experiment.add_image( - "output", - torchvision.utils.make_grid(model_output.recon_x), - batch_idx, + self.log_dict( + { + "loss/train": loss, + "mse/train": F.mse_loss(model_output.recon_x, x["data"]), + }, + # on_step=True, + on_epoch=True, + prog_bar=True, + logger=True, ) - + if isinstance(self.logger, pl.loggers.TensorBoardLogger): + self.log_tensorboard(model_output, x) return loss def loss_function(self, model_output, *args, **kwargs): @@ -121,20 +115,13 @@ def validation_step(self, batch, batch_idx): x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output(x, batch_idx) z = self.embedding_from_output(model_output) - # z_indices - self.logger.experiment.add_embedding( - z, - label_img=x["data"], - global_step=self.current_epoch, - tag="z", - ) - - self.logger.experiment.add_scalar("Loss/val", loss, batch_idx) - self.logger.experiment.add_image( - "val", - torchvision.utils.make_grid(model_output["recon_x"]), - batch_idx, + self.log_dict( + { + "loss/val": loss, + "mse/val": F.mse_loss(model_output.recon_x, x["data"]), + } ) + 
return loss # def lr_scheduler_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None): # # Implement your own logic for updating the lr scheduler @@ -181,19 +168,27 @@ def test_step(self, batch, batch_idx): loss = self.loss_function(model_output) # Log test metrics - self.log("test_loss", loss) + self.log_dict( + { + "loss/test": loss, + "mse/test": F.mse_loss(model_output.recon_x, x["data"]), + } + ) + return loss + + def log_wandb(self): + pass + + def log_tensorboard(self, model_output, x): # Optionally you can add more logging, for example, visualizations: self.logger.experiment.add_image( "test_input", torchvision.utils.make_grid(x["data"]), - batch_idx, + self.global_step, ) self.logger.experiment.add_image( "test_output", torchvision.utils.make_grid(model_output.recon_x), - batch_idx, + self.global_step, ) - - # Return whatever data you need, for example, the loss - return loss diff --git a/bioimage_embed/models/pythae/legacy/vq_vae.py b/bioimage_embed/models/pythae/legacy/vq_vae.py index 38a45706..8ddc00c1 100644 --- a/bioimage_embed/models/pythae/legacy/vq_vae.py +++ b/bioimage_embed/models/pythae/legacy/vq_vae.py @@ -132,10 +132,12 @@ def forward(self, x, epoch=None): input=x["data"], ) # This matches how pythae returns the loss + + indices = (encodings == 1).nonzero(as_tuple=True) + recon_loss = F.mse_loss(x_recon, x["data"], reduction="sum") - mse_loss = F.mse_loss(x_recon, x["data"]) + mse_loss = F.mse_loss(x_recon, x["data"], reduction="mean") - indices = (encodings == 1).nonzero(as_tuple=True) variational_loss = loss-mse_loss pythae_loss_dict = { diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index df8f26fd..bf7e5ec5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -295,13 +295,15 @@ def shape_embed_process(): model_dir = f"my_models/{dataset_path}_{model._get_name()}_{lit_model._get_name()}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + wandb = pl_loggers.WandbLogger(project="bioimage-embed", name="shapes") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + wandb.watch(lit_model, log="all") trainer = pl.Trainer( - logger=tb_logger, + logger=[wandb,tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -310,6 +312,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + log_every_n_steps=1, ) # %% try: From 3b37ca94c5eb040a86b83a2fbedfebf0988e91f5 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:44:17 +0000 Subject: [PATCH 012/204] First attempt as arg hashing for checkpoints --- scripts/shapes/shape_embed.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index df8f26fd..af88d130 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -51,9 +51,17 @@ from matplotlib import rc import logging +import pickle +import base64 +import hashlib logger = logging.getLogger(__name__) +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string def scoring_df(X, y): # Split the data into training and test sets @@ -157,7 +165,6 @@ def shape_embed_process(): path = Path(metadata("")) path.mkdir(parents=True, exist_ok=True) - model_dir = 
f"models/{dataset_path}_{args.model}" # %% transform_crop = CropCentroidPipeline(window_size) @@ -292,7 +299,7 @@ def shape_embed_process(): dataloader.setup() model.eval() - model_dir = f"my_models/{dataset_path}_{model._get_name()}_{lit_model._get_name()}" + model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") From 8e1894340000e1354b9d5905aa1e6fc455e1bc07 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:55:36 +0000 Subject: [PATCH 013/204] Early stopping on val loss to stop overfitting --- scripts/shapes/shape_embed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index bf7e5ec5..37792a1b 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping # Deal with the filesystem import torch.multiprocessing @@ -312,6 +313,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 397abe8fac046250ed493de4b9892d0209c73eed Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 12:29:21 +0000 Subject: [PATCH 014/204] Attempt at cli --- bioimage_embed/augmentations.py | 73 +++++++-------- bioimage_embed/cli.py | 12 +++ bioimage_embed/hydra.py | 106 +++++++++++++++++++++ bioimage_embed/tests/test_cli.py | 42 +++++++++ conf/augmentations/default.yaml | 70 -------------- conf/bio_vae/default.yaml | 8 -- conf/checkpoints/default.yaml | 3 - conf/config.yaml | 153 ------------------------------- conf/dataloader/default.yaml | 7 -- conf/dataset/default.yaml | 0 conf/hydra/default.yaml | 14 --- conf/ivy_gap.yaml | 103 --------------------- conf/lightning/default.yaml | 3 - conf/logger/default.yaml | 2 - conf/paths/default.yaml | 18 ---- conf/pythae/default.yaml | 17 ---- conf/timm/default.yaml | 15 --- conf/trainer/default.yaml | 18 ---- scripts/shapes/shape_embed.py | 33 +------ 19 files changed, 197 insertions(+), 500 deletions(-) create mode 100644 bioimage_embed/cli.py create mode 100644 bioimage_embed/hydra.py create mode 100644 bioimage_embed/tests/test_cli.py delete mode 100644 conf/augmentations/default.yaml delete mode 100644 conf/bio_vae/default.yaml delete mode 100644 conf/checkpoints/default.yaml delete mode 100644 conf/config.yaml delete mode 100644 conf/dataloader/default.yaml delete mode 100644 conf/dataset/default.yaml delete mode 100644 conf/hydra/default.yaml delete mode 100644 conf/ivy_gap.yaml delete mode 100644 conf/lightning/default.yaml delete mode 100644 conf/logger/default.yaml delete mode 100644 conf/paths/default.yaml delete mode 100644 conf/pythae/default.yaml delete mode 100644 conf/timm/default.yaml delete mode 100644 conf/trainer/default.yaml diff --git a/bioimage_embed/augmentations.py b/bioimage_embed/augmentations.py index e2c14074..6c9daba4 100644 --- a/bioimage_embed/augmentations.py +++ b/bioimage_embed/augmentations.py @@ -1,40 +1,6 @@ import albumentations as A import cv2 -DEFAULT_AUGMENTATION = A.Compose( - [ - # Flip the images horizontally or vertically with a 50% chance - A.OneOf( - [ - A.HorizontalFlip(p=0.5), - A.VerticalFlip(p=0.5), - ], - p=0.5, - ), - # Rotate the images by a random angle within a specified range - A.Rotate(limit=45, p=0.5), - # Randomly scale the image intensity to adjust 
brightness and contrast - A.RandomGamma(gamma_limit=(80, 120), p=0.5), - # Apply random elastic transformations to the images - A.ElasticTransform( - alpha=1, - sigma=50, - alpha_affine=50, - p=0.5, - ), - # Shift the image channels along the intensity axis - A.ChannelShuffle(p=0.5), - # Add a small amount of noise to the images - A.GaussNoise(var_limit=(10.0, 50.0), p=0.5), - # Crop a random part of the image and resize it back to the original size - A.RandomResizedCrop( - height=512, width=512, scale=(0.9, 1.0), ratio=(0.9, 1.1), p=0.5 - ), - # Adjust image intensity with a specified range for individual channels - A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5), - ] -) - from typing import Any import albumentations @@ -43,6 +9,39 @@ from omegaconf import DictConfig from PIL import Image +DEFAULT_AUGMENTATION_LIST = [ + # Flip the images horizontally or vertically with a 50% chance + A.OneOf( + [ + A.HorizontalFlip(p=0.5), + A.VerticalFlip(p=0.5), + ], + p=0.5, + ), + # Rotate the images by a random angle within a specified range + A.Rotate(limit=45, p=0.5), + # Randomly scale the image intensity to adjust brightness and contrast + A.RandomGamma(gamma_limit=(80, 120), p=0.5), + # Apply random elastic transformations to the images + A.ElasticTransform( + alpha=1, + sigma=50, + alpha_affine=50, + p=0.5, + ), + # Shift the image channels along the intensity axis + A.ChannelShuffle(p=0.5), + # Add a small amount of noise to the images + A.GaussNoise(var_limit=(10.0, 50.0), p=0.5), + # Crop a random part of the image and resize it back to the original size + A.RandomResizedCrop( + height=512, width=512, scale=(0.9, 1.0), ratio=(0.9, 1.1), p=0.5 + ), + # Adjust image intensity with a specified range for individual channels + A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5), +] + +DEFAULT_AUGMENTATION = A.Compose(DEFAULT_AUGMENTATION_LIST) class TransformsWrapper: def __init__(self, transforms_cfg: DictConfig) -> None: @@ -81,9 +80,7 @@ def __init__(self, transforms_cfg: DictConfig) -> None: _convert_="object", ) valid_test_predict_aug.append(aug) - self.valid_test_predict_aug = albumentations.Compose( - valid_test_predict_aug - ) + self.valid_test_predict_aug = albumentations.Compose(valid_test_predict_aug) def set_mode(self, mode: str) -> None: """Set `__call__` mode. 
@@ -111,4 +108,4 @@ def __call__(self, image: Any, **kwargs: Any) -> Any: image = np.asarray(image) if self.mode == "train": return self.train_aug(image=image, **kwargs) - return self.valid_test_predict_aug(image=image, **kwargs) \ No newline at end of file + return self.valid_test_predict_aug(image=image, **kwargs) diff --git a/bioimage_embed/cli.py b/bioimage_embed/cli.py new file mode 100644 index 00000000..45529654 --- /dev/null +++ b/bioimage_embed/cli.py @@ -0,0 +1,12 @@ +from .hydra import train, infer +from typer import Typer + +app = Typer() +app.command()(train) +app.command()(infer) + +def main(): + app() + +if __name__ == "__main__": + main() diff --git a/bioimage_embed/hydra.py b/bioimage_embed/hydra.py new file mode 100644 index 00000000..46ad75de --- /dev/null +++ b/bioimage_embed/hydra.py @@ -0,0 +1,106 @@ +from hydra.core.config_store import ConfigStore +from dataclasses import dataclass +from hydra import compose, initialize +from omegaconf import OmegaConf +from types import SimpleNamespace +import hydra +from hydra.core.config_store import ConfigStore +from omegaconf import OmegaConf +import albumentations +from dataclasses import dataclass, field +from bioimage_embed.augmentations import DEFAULT_AUGMENTATION_LIST +import albumentations as A +import os + +@dataclass +class Receipe: + _target_: str = "types.SimpleNamespace" + opt: str = "adamw" + weight_decay: float = 0.001 + momentum: float = 0.9 + sched: str = "cosine" + epochs: int = 50 + lr: float = 1e-4 + min_lr: float = 1e-6 + t_initial: int = 10 + t_mul: int = 2 + lr_min: float = None + decay_rate: float = 0.1 + warmup_lr: float = 1e-6 + warmup_lr_init: float = 1e-6 + warmup_epochs: int = 5 + cycle_limit: int = None + t_in_epochs: bool = False + noisy: bool = False + noise_std: float = 0.1 + noise_pct: float = 0.67 + noise_seed: int = None + cooldown_epochs: int = 5 + warmup_t: int = 0 + + +@dataclass +class Transform: + _target_: str = "albumentations.Compose" + transforms: A.Compose = field(default_factory=A.Compose(DEFAULT_AUGMENTATION_LIST)) + + +# @dataclass +# class AlbumentationsTransform: +# _target_: str = "albumentations.from_dict" +# transform_dict: dict = field(default_factory=A.from_dict) +# transform = A.from_dict(OmegaConf.to_container(cfg.albumentations, resolve=True)) + + +@dataclass +class ImageDataset: + _target_: str = "torchvision.datasets.ImageFolder" + transform: Transform = field(default_factory=Transform) + + +@dataclass +class Dataset: + pass + + +@dataclass +class DataLoader: + _target_: str = "bioimage_embed.lightning.dataloader.DataModule" + dataset: str = field(default_factory=ImageDataset) + + +# def cs_generator(): +cs = ConfigStore.instance() +cs.store(name="receipe", node=Receipe) +cs.store(name="dataloader", node=DataLoader) + + +# return cs +def train(): + main(job_name="test_app") + + +def write_default_config_file(config_path, config_filename, config): + os.makedirs(config_path, exist_ok=True) + with open(os.path.join(config_path, config_filename), "w") as file: + file.write(OmegaConf.to_yaml(config)) + + +def main(config_path="conf", job_name="test_app"): + config_file = os.path.join(config_path, "config.yaml") + + # Check if the configuration directory exists, if not, create it + if not os.path.exists(config_path): + os.makedirs(config_path) + # Initialize Hydra with a basic configuration + hydra.initialize(version_base=None, config_path=config_path, job_name=job_name) + cfg = hydra.compose(config_name="config") + # Save the default configuration + with open(config_file, 
"w") as file: + file.write(OmegaConf.to_yaml(cfg)) + else: + # Initialize Hydra normally if the configuration directory exists + hydra.initialize(version_base=None, config_path=config_path, job_name=job_name) + cfg = hydra.compose(config_name="config") + + print(OmegaConf.to_yaml(cfg)) diff --git a/bioimage_embed/tests/test_cli.py b/bioimage_embed/tests/test_cli.py new file mode 100644 index 00000000..dca082aa --- /dev/null +++ b/bioimage_embed/tests/test_cli.py @@ -0,0 +1,42 @@ +import os +import pytest +from ..hydra import main + +def test_main_creates_config(): + # Arrange + config_path = "test_conf" + job_name = "test_app" + + # Ensure the configuration directory does not exist initially + if os.path.exists(config_path): + os.rmdir(config_path) + + # Act + main(config_path=config_path, job_name=job_name) + + # Assert + assert os.path.exists(config_path), "Config directory was not created" + assert os.path.isfile(os.path.join(config_path, "config.yaml")), "Config file was not created" + + # Clean up + os.remove(os.path.join(config_path, "config.yaml")) + os.rmdir(config_path) + +@pytest.mark.parametrize("config_path, job_name", [ + ("conf", "test_app"), + ("another_conf", "another_job") +]) +def test_hydra_initializes(config_path, job_name): + # Act + main(config_path=config_path, job_name=job_name) + + # Assert + # Here you can assert specifics about the cfg object if needed. + # Since main does not return anything, you might need to adjust + # the main function to return the cfg for more thorough testing. + + # Clean up + if os.path.exists(config_path): + os.remove(os.path.join(config_path, "config.yaml")) + os.rmdir(config_path) + \ No newline at end of file diff --git a/conf/augmentations/default.yaml b/conf/augmentations/default.yaml deleted file mode 100644 index 3ab17c45..00000000 --- a/conf/augmentations/default.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# __version__: 1.3.0 -# transform: -# __class_fullname__: Compose -# additional_targets: {} -# bbox_params: null -# keypoint_params: null -# p: 1.0 -# transforms: -# - __class_fullname__: OneOf -# p: 0.5 -# transforms: -# - __class_fullname__: HorizontalFlip -# always_apply: false -# p: 0.5 -# - __class_fullname__: VerticalFlip -# always_apply: false -# p: 0.5 -# - __class_fullname__: Rotate -# always_apply: false -# border_mode: 4 -# crop_border: false -# interpolation: 1 -# limit: -# - -45 -# - 45 -# mask_value: null -# p: 0.5 -# rotate_method: largest_box -# value: null -# - __class_fullname__: RandomGamma -# always_apply: false -# eps: null -# gamma_limit: -# - 80 -# - 120 -# p: 0.5 -# - __class_fullname__: ElasticTransform -# alpha: 1 -# alpha_affine: 50 -# always_apply: false -# approximate: false -# border_mode: 4 -# interpolation: 1 -# mask_value: null -# p: 0.5 -# same_dxdy: false -# sigma: 50 -# value: null -# - __class_fullname__: GaussNoise -# always_apply: false -# mean: 0 -# p: 0.5 -# per_channel: true -# var_limit: -# - 10.0 -# - 50.0 -# - __class_fullname__: RandomCrop -# always_apply: false -# height: ${dataset.crop_size[0]} -# p: 1 -# width: ${dataset.crop_size[1]} -# - __class_fullname__: Normalize -# always_apply: true -# p: 1.0 -# transpose_mask: false -# - __class_fullname__: ToTensorV2 -# always_apply: true -# p: 1.0 -# transpose_mask: false - diff --git a/conf/bio_vae/default.yaml b/conf/bio_vae/default.yaml deleted file mode 100644 index 12f762d1..00000000 --- a/conf/bio_vae/default.yaml +++ /dev/null @@ -1,8 +0,0 @@ -_target_: bioimage_embed.models.BioimageEmbed -model: "VQVAE" -input_dim: - - 3 - - 128 - - 
128 -latent_dim: 64 -model_config: ${pythae.model_config} diff --git a/conf/checkpoints/default.yaml b/conf/checkpoints/default.yaml deleted file mode 100644 index 76ebb7cf..00000000 --- a/conf/checkpoints/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint -dirpath: ${paths.output_dir} -save_last: True \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml deleted file mode 100644 index d8156ab3..00000000 --- a/conf/config.yaml +++ /dev/null @@ -1,153 +0,0 @@ -defaults: - - _self_ - - trainer: default.yaml - - pythae: default.yaml - # - optimizer: default.yaml - # - scheulder: default.yaml - - timm: default.yaml - - augmentations: default.yaml - # - dataset: default.yaml - - dataloader: default.yaml - - paths: default.yaml - - lightning: default.yaml - - bioimage_embed: default.yaml - - logger: default.yaml - - checkpoints: default.yaml - -version_base: 2.0 - -# seed for random number generators in pytorch, numpy and python.random -seed: 42 - -# name of the run, accessed by loggers -name: null - -trainer: - accelerator: "gpu" - devices: "auto" - gradient_clip_val: 1 - accumulate_grad_batches: 16 - min_epochs: 0 - max_epochs: 200 - strategy: "ddp" - profiler: null - fast_dev_run: False - -dataset: - name: "ivy_gap" - # dir: "data" - train_dataset_glob: ${paths.data_dir}/${dataset.name}/random/*png - crop_size: - - 256 - - 256 - -dataloader: - batch_size: 32 - num_workers: 8 - pin_memory: false - shuffle: true - persistent_workers: true - -model: - _target_: bioimage_embed.models.create_model - name: "resnet18_vqvae_legacy" - # Dims match ImageNet - input_dim: [3, 64, 64] - latent_dim: 8 - opt: LAMB - lr: 1.0e-4 - weight_decay: 0.0001 - momentum: 0.9 - sched: cosine - min_lr: 1.0e-6 - warmup_epochs: 5 - warmup_lr: 1.0e-6 - cooldown_epochs: 10 - t_max: 50 - cycle_momentum: false - -# pythae: -# encoder: bioimage_embed.models.ResNet18VAEEncoder -# # _target_: Encoder_ResNet_VQVAE_CELEBA -# decoder: bioimage_embed.models.ResNet18VAEDecoder -# model_config: -# _target_: pythae.models.VAEConfig - -albumentations: - __version__: 1.3.0 - transform: - __class_fullname__: Compose - additional_targets: {} - bbox_params: null - keypoint_params: null - p: 1.0 - transforms: - - __class_fullname__: OneOf - p: 0.5 - transforms: - - __class_fullname__: HorizontalFlip - always_apply: false - p: 0.5 - - __class_fullname__: VerticalFlip - always_apply: false - p: 0.5 - - __class_fullname__: RandomCrop - always_apply: true - height: ${dataset.crop_size[0]} - p: 1 - width: ${dataset.crop_size[1]} - # scale: - # - 1.0 - # - 1.0 - # - __class_fullname__: Rotate - # always_apply: false - # border_mode: 4 - # crop_border: false - # interpolation: 1 - # limit: - # - -45 - # - 45 - # mask_value: null - # p: 0.5 - # rotate_method: largest_box - # value: null - # - __class_fullname__: RandomGamma - # always_apply: false - # eps: null - # gamma_limit: - # - 80 - # - 120 - # p: 0.5 - # - __class_fullname__: ElasticTransform - # alpha: 1 - # alpha_affine: 50 - # always_apply: false - # approximate: false - # border_mode: 4 - # interpolation: 1 - # mask_value: null - # p: 0.5 - # same_dxdy: false - # sigma: 50 - # value: null - # - __class_fullname__: GaussNoise - # always_apply: false - # mean: 0 - # p: 0.5 - # per_channel: true - # var_limit: - # - 10.0 - # - 50.0 - - __class_fullname__: Resize - always_apply: true - height: ${model.input_dim[1]} - p: 1 - width: ${model.input_dim[2]} - - __class_fullname__: ToFloat - always_apply: true - 
p: 1.0 - max_value: 1.0 - - __class_fullname__: ToTensorV2 - always_apply: true - p: 1.0 - # transpose_mask: false diff --git a/conf/dataloader/default.yaml b/conf/dataloader/default.yaml deleted file mode 100644 index 872861b6..00000000 --- a/conf/dataloader/default.yaml +++ /dev/null @@ -1,7 +0,0 @@ -_target_: bioimage_embed.lightning.DatamoduleGlob -glob_str: ${dataset.train_dataset_glob} -batch_size: 32 -num_workers: 4 -pin_memory: true -shuffle: true -persistent_workers: true \ No newline at end of file diff --git a/conf/dataset/default.yaml b/conf/dataset/default.yaml deleted file mode 100644 index e69de29b..00000000 diff --git a/conf/hydra/default.yaml b/conf/hydra/default.yaml deleted file mode 100644 index 9de8ac12..00000000 --- a/conf/hydra/default.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# https://hydra.cc/docs/configure_hydra/intro/ -# https://github.com/ashleve/lightning-hydra-template/blob/main/configs/hydra/default.yaml - -# enable color logging -defaults: - - override hydra_logging: colorlog - - override job_logging: colorlog - -# output directory, generated dynamically on each run -run: - dir: ${paths.log_dir}/${task_name}/runs/${now:%Y-%m-%d}_${now:%H-%M-%S} -sweep: - dir: ${paths.log_dir}/${task_name}/multiruns/${now:%Y-%m-%d}_${now:%H-%M-%S} - subdir: ${hydra.job.num} \ No newline at end of file diff --git a/conf/ivy_gap.yaml b/conf/ivy_gap.yaml deleted file mode 100644 index 777faccc..00000000 --- a/conf/ivy_gap.yaml +++ /dev/null @@ -1,103 +0,0 @@ -dataset: "ivy_gap" -data_dir: "data" -train_dataset_glob: f"{data_dir}/{dataset}/random/*png" - -optimizer_params: - opt: LAMB - lr: 0.001 - weight_decay: 0.0001 - momentum: 0.9 - -lr_scheduler_params: - sched: cosine - min_lr: 1.0e-6 - warmup_epochs: 5 - warmup_lr: 1.0e-6 - cooldown_epochs: 10 - t_max: 50 - cycle_momentum: false - -albumentations: - __version__: 1.3.0 - transform: - __class_fullname__: Compose - additional_targets: {} - bbox_params: null - keypoint_params: null - p: 1.0 - transforms: - - __class_fullname__: OneOf - p: 0.5 - transforms: - - __class_fullname__: HorizontalFlip - always_apply: false - p: 0.5 - - __class_fullname__: VerticalFlip - always_apply: false - p: 0.5 - - __class_fullname__: Rotate - always_apply: false - border_mode: 4 - crop_border: false - interpolation: 1 - limit: - - -45 - - 45 - mask_value: null - p: 0.5 - rotate_method: largest_box - value: null - - __class_fullname__: RandomGamma - always_apply: false - eps: null - gamma_limit: - - 80 - - 120 - p: 0.5 - - __class_fullname__: ElasticTransform - alpha: 1 - alpha_affine: 50 - always_apply: false - approximate: false - border_mode: 4 - interpolation: 1 - mask_value: null - p: 0.5 - same_dxdy: false - sigma: 50 - value: null - - __class_fullname__: GaussNoise - always_apply: false - mean: 0 - p: 0.5 - per_channel: true - var_limit: - - 10.0 - - 50.0 - - __class_fullname__: RandomCrop - always_apply: false - height: 128 - p: 1 - width: 128 - - __class_fullname__: RandomBrightnessContrast - always_apply: false - brightness_by_max: true - brightness_limit: - - -0.2 - - 0.2 - contrast_limit: - - -0.2 - - 0.2 - p: 0.5 - - __class_fullname__: Normalize - always_apply: false - max_pixel_value: 255.0 - mean: - - 0.485 - - 0.456 - - 0.406 - p: 1.0 - std: - - 0.229 - - 0.224 - - 0.225 diff --git a/conf/lightning/default.yaml b/conf/lightning/default.yaml deleted file mode 100644 index 6a45b2de..00000000 --- a/conf/lightning/default.yaml +++ /dev/null @@ -1,3 +0,0 @@ -_target_: bioimage_embed.lightning.LitAutoEncoderTorch -model: ${pythae} -args: 
${timm} \ No newline at end of file diff --git a/conf/logger/default.yaml b/conf/logger/default.yaml deleted file mode 100644 index 2ad96e8b..00000000 --- a/conf/logger/default.yaml +++ /dev/null @@ -1,2 +0,0 @@ -_target_: pytorch_lightning.loggers.TensorBoardLogger -save_dir: ${paths.log_dir} diff --git a/conf/paths/default.yaml b/conf/paths/default.yaml deleted file mode 100644 index d8738dc1..00000000 --- a/conf/paths/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# path to root directory -# this requires PROJECT_ROOT environment variable to exist -# you can replace it with "." if you want the root to be the current working directory -# root_dir: ${oc.env:PROJECT_ROOT} -root_dir: . -# path to data directory -data_dir: ${paths.root_dir}/data/ - -# path to logging directory -log_dir: ${paths.root_dir}/logs/ - -# path to output directory, created dynamically by hydra -# path generation pattern is specified in `configs/hydra/default.yaml` -# use it to store all files generated during the run, like ckpts and metrics -output_dir: ${hydra:runtime.output_dir} - -# path to working directory -work_dir: ${hydra:runtime.cwd} \ No newline at end of file diff --git a/conf/pythae/default.yaml b/conf/pythae/default.yaml deleted file mode 100644 index f4c01e7f..00000000 --- a/conf/pythae/default.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# model_name: VQVAE - -# model: -_target_: pythae.models.VAE -# model_config: $(model.model_config) -encoder: - _target_: bioimage_embed.models.ResNet18VAEEncoder - model_config: ${pythae.model_config} -decoder: - _target_: bioimage_embed.models.ResNet18VAEDecoder - model_config: ${pythae.model_config} - -model_config: - _target_: pythae.models.VAEConfig - _convert_: all - input_dim: ${model.input_dim} - latent_dim: ${model.latent_dim} diff --git a/conf/timm/default.yaml b/conf/timm/default.yaml deleted file mode 100644 index 0d61e8c3..00000000 --- a/conf/timm/default.yaml +++ /dev/null @@ -1,15 +0,0 @@ - # _target_: timm.optim.optimizer -opt: LAMB -lr: 1.0e-3 -weight_decay: 0.0001 -momentum: 0.9 -# scheduler: -# _target_: timm.scheduler.scheduler -sched: cosine -min_lr: 1.0e-6 -warmup_epochs: 5 -warmup_lr: 1.0e-6 -cooldown_epochs: 10 -t_max: 50 -cycle_momentum: false -epochs: ${trainer.max_epochs} \ No newline at end of file diff --git a/conf/trainer/default.yaml b/conf/trainer/default.yaml deleted file mode 100644 index 86d4d552..00000000 --- a/conf/trainer/default.yaml +++ /dev/null @@ -1,18 +0,0 @@ -_target_: pytorch_lightning.Trainer - -accelerator: "gpu" -devices: "1" -# weights_summary: null -# progress_bar_refresh_rate: 5 -# resume_from_checkpoint: null -# val_check_interval: 1 -check_val_every_n_epoch: 1 -logger: ${logger} -gradient_clip_val: 1 -enable_checkpointing: True -accumulate_grad_batches: 4 -callbacks: - - ${checkpoints} -min_epochs: 50 -max_epochs: 200 -precision: 32 \ No newline at end of file diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 37792a1b..1034256e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -116,11 +116,7 @@ def shape_embed_process(): "latent_dim": interp_size, "num_embeddings": interp_size, "num_hiddens": interp_size, - "num_residual_hiddens": 32, - "num_residual_layers": 150, "pretrained": True, - # "embedding_dim": 32, - # "num_embeddings": 16, "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, @@ -153,7 +149,7 @@ def shape_embed_process(): # dataset = "bbbc010" # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = 
f"scripts/shapes/data/{dataset_path}" + train_data_path = f"data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) @@ -360,7 +356,7 @@ def shape_embed_process(): indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) y_partial[indices] = -1 y_blind = -1 * np.ones_like(y) - + df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels @@ -370,31 +366,6 @@ def shape_embed_process(): df = df.set_index("Class") df_shape_embed = df.copy() - ax = sns.relplot( - data=df, - x="umap0", - y="umap1", - hue="Class", - palette="deep", - alpha=0.5, - edgecolor=None, - s=5, - height=height, - aspect=0.5 * width / height, - ) - - sns.move_legend( - ax, - "upper center", - ) - ax.set(xlabel=None, ylabel=None) - sns.despine(left=True, bottom=True) - plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) - plt.tight_layout() - plt.savefig(metadata(f"umap_no_axes.pdf")) - # plt.show() - plt.close() - # %% X = df_shape_embed.to_numpy() From 4bf27f9930fe86bdb64e3c48701ddfe7742a8fea Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 18 Jan 2024 10:55:36 +0000 Subject: [PATCH 015/204] Early stopping on val loss to stop overfitting --- scripts/shapes/shape_embed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f70ed089..91c466d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -20,6 +20,7 @@ import pytorch_lightning as pl import torch from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping # Deal with the filesystem import torch.multiprocessing @@ -319,6 +320,7 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, + callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 4ca9dd01f6bb411291abb7317cc409b465d59eed Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Sat, 20 Jan 2024 08:19:41 +0000 Subject: [PATCH 016/204] adding branch prose back --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b34690d..ccf6c625 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ This utility makes it simple to fetch the necessary datasets: ```bash make download.data ``` +If you don't have a Kaggle account you must create one and then follow the next steps: +1. Install the Kaggle API package so you can download the data from the Makefile you have all the information in their [Github repository](https://github.com/Kaggle/kaggle-api). +2. To use the Kaggle API you need also to create an API token. + You can found how to do it in their [documentation](https://github.com/Kaggle/kaggle-api#api-credentials) +4. After that you will need to add your user and key in a file called `kaggle.json` in this location in your home directory `chmod 600 ~/.kaggle/kaggle.json` +5. Don't forget to accept the conditions for the "2018 Data Science Bowl" on the Kaggle website. + Otherwise you would not be able to pull this data from the command line. ### 4. Developer Installation: @@ -88,4 +95,4 @@ bioimage_embed is licensed under the MIT License. Please refer to the [LICENSE]( --- -Happy Embedding! 🧬🔬 \ No newline at end of file +Happy Embedding! 
🧬🔬 From a34ee720594c02af5aec304a7c521a3c5ee3a22e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:36:43 +0000 Subject: [PATCH 017/204] local changes to run --- scripts/shapes/shape_embed.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 986f556c..038707d4 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -150,14 +150,14 @@ def shape_embed_process(): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" - dataset_path = "shape_embed_data/data/bbbc010/BBBC010_v1_foreground_eachworm/" + dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" # dataset_path = "vampire/mefs/data/processed/Control" # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" # dataset_path = "vampire/torchvision/Control" # dataset = "bbbc010" # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = f"data/{dataset_path}" + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) @@ -316,7 +316,6 @@ def shape_embed_process(): callbacks=[checkpoint_callback], min_epochs=50, max_epochs=args.epochs, - callbacks=[EarlyStopping(monitor="loss/val", mode="min")], log_every_n_steps=1, ) # %% From 9135043708fbba92c0637b91ce41664917a6ab28 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:44:54 +0000 Subject: [PATCH 018/204] command line arguments --- scripts/shapes/shape_embed.py | 57 +++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 038707d4..1ced1d5f 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -21,6 +21,7 @@ import torch from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping +import argparse # Deal with the filesystem import torch.multiprocessing @@ -98,7 +99,7 @@ def scoring_df(X, y): return pd.DataFrame(cv_results) -def shape_embed_process(): +def shape_embed_process(clargs): # Setting the font size mpl.rcParams["font.size"] = 10 @@ -111,14 +112,18 @@ def shape_embed_process(): sns.set(style="white", context="notebook", rc={"figure.figsize": (width, height)}) # matplotlib.use("TkAgg") - interp_size = 128 * 2 + interp_size = clargs.latent_space_size * 2 + #interp_size = 128 * 2 max_epochs = 100 - window_size = 128 * 2 + window_size = clargs.latent_space_size * 2 + #window_size = 128 * 2 params = { - "model":"resnet18_vqvae_legacy", + "model":clargs.model, + #"model":"resnet18_vae", "epochs": 75, - "batch_size": 4, + "batch_size": clargs.batch_size, + #"batch_size": 4, "num_workers": 2**4, "input_dim": (3, interp_size, interp_size), "latent_dim": interp_size, @@ -496,5 +501,45 @@ def shape_embed_process(): # tikzplotlib.save(metadata(f"trials_barplot.tikz")) + + +############################################################################### + if __name__ == "__main__": - shape_embed_process() + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-b', '--batch-size', nargs=1, default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', nargs=1, default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + #parser.add_argument('-v', '--verbose', action='count', default=0, + # help="Increase verbosity level by adding more \"v\".") + + shape_embed_process(parser.parse_args()) From 43c6ed0ea98a94d648d02dc623f745e640ef7a8a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 12:47:20 +0000 Subject: [PATCH 019/204] enable testing + uncomment dataset --- scripts/shapes/shape_embed.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 1ced1d5f..9130892e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -154,14 +154,8 @@ def shape_embed_process(clargs): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" - # dataset_path = "vampire/mefs/data/processed/Control" - # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" - # dataset_path = "vampire/torchvision/Control" - # dataset = "bbbc010" - - # train_data_path = f"scripts/shapes/data/{dataset_path}" + dataset = "bbbc010" train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" @@ -334,7 +328,7 @@ def shape_embed_process(clargs): lit_model.eval() validation = trainer.validate(lit_model, datamodule=dataloader) - # testing = trainer.test(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) example_input = Variable(torch.rand(1, *args.input_dim)) # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") From e4e0aaeac9bedac111433d9a43c6f14f97131630 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 21:58:18 +0000 Subject: [PATCH 020/204] added a slurm python script --- slurm_shape_embed.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 slurm_shape_embed.py diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py new file mode 100644 index 00000000..68e16cb9 --- /dev/null +++ b/slurm_shape_embed.py @@ -0,0 +1,82 @@ +#! 
/usr/bin/env python3 + +import os +import subprocess +import tempfile + +## Assign the arguments to variables +#model_arg=$1 +#sizes_list="${@:2}" +# +## Create SLURM job script +#job_script="slurm_job.sh" +# +#echo "#!/bin/bash" > "$job_script" +#echo "#SBATCH --job-name=ite_shape_embed" >> "$job_script" +#echo "#SBATCH --output=ite_shape_embed.out" >> "$job_script" +#echo "#SBATCH --error=ite_shape_embed.err" >> "$job_script" +#echo "#SBATCH --gres=gpu:2" >> "$job_script" # Adjust the number of CPUs as needed +#echo "#SBATCH --mem=50GB" >> "$job_script" # Adjust the memory requirement as needed +#echo "" >> "$job_script" +# +## Loop through the sizes and append the Python command to the job script +#for size in $sizes_list; do +# echo "python ite_shape_embed.py --model $model_arg --ls_size $size" >> "$job_script" +#done +# +## Submit SLURM job +#sbatch "$job_script" + +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] +batch_sizes = [4, 8, 16] +latent_space_sizes = [64, 128, 256, 512] + +slurm_script="""#!/bin/bash + +JOB_NAME=shape_embed_{model}_{b_size}_{ls_size} +echo "running shape embed with:" +echo " - model {model}" +echo " - batch size {b_size}" +echo " - latent space size {ls_size}" +python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} +""" + +if __name__ == "__main__": + + slurmdir = f'{os.getcwd()}/slurmdir' + os.makedirs(slurmdir, exist_ok=True) + for m, bs, ls in [ (m,bs,ls) for m in models + for bs in batch_sizes + for ls in latent_space_sizes ]: + jobname = f'shape_embed_{m}_{bs}_{ls}' + print(jobname) + fp = open(mode='w+', file=f'{slurmdir}/slurm_script_shape_embed_{m}_{bs}_{ls}.script') + fp.write(slurm_script.format(model=m, b_size=bs, ls_size=ls)) + fp.flush() + print(f'{fp.name}') + print(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) + result = subprocess.run([ 'sbatch' + , '--time', '10:00:00' + , '--mem', '50GB' + , '--job-name', jobname + , '--output', f'{slurmdir}/{jobname}.out' + , '--error', f'{slurmdir}/{jobname}.err' + , '--gres', 'gpu:2' + , fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) From e78afd640ac0164064ee83927294d009b01fce87 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 21:59:37 +0000 Subject: [PATCH 021/204] fix cli type --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 9130892e..c0560da0 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -528,10 +528,10 @@ def auto_pos_int (x): '-m', '--model', choices=models, default=models[0], metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {models[0]}).") parser.add_argument( - '-b', '--batch-size', nargs=1, default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int , help="The BATCH_SIZE for the run, a positive integer (default 4)") parser.add_argument( - '-l', '--latent-space-size', nargs=1, default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + '-l', '--latent-space-size', default=int(128), 
metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") From ee021c64786b82d9284812e43861afdf84474046 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:04:33 +0000 Subject: [PATCH 022/204] add correct name for the jobs --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index c0560da0..96124967 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -298,7 +298,7 @@ def shape_embed_process(clargs): model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") - wandb = pl_loggers.WandbLogger(project="bioimage-embed", name="shapes") + wandb = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) From 30c34ccd740d5c88a2d25c21a0cbb5f2e1fb5106 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:07:59 +0000 Subject: [PATCH 023/204] Log f1 score mean and std in wandb --- scripts/shapes/shape_embed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 96124967..27cd5a38 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -465,6 +465,13 @@ def shape_embed_process(clargs): trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") + # Special metrics for f1 score for wandb + wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandb.log({"Std": wandb.Table(dataframe=std_df)}) + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) ax = sns.catplot( From b77c4fc0d3838a6c3dd225bbdba180d885375807 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:34:40 +0000 Subject: [PATCH 024/204] choose memory allocation base on latent space size --- scripts/shapes/shape_embed.py | 1 + slurm_shape_embed.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 27cd5a38..d9c6eb86 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,6 +22,7 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse +import wandb # Deal with the filesystem import torch.multiprocessing diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index 68e16cb9..3b78ee35 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -55,6 +55,14 @@ python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} """ +def mem_size(ls): + if ls <= 128: + return '50GB' + if ls <= 256: + return '100GB' + if ls <= 512: + return '300GB' + if __name__ == "__main__": slurmdir = f'{os.getcwd()}/slurmdir' @@ -71,9 +79,10 @@ print(f'cat {fp.name}') result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) + print(mem_size(ls)) result = subprocess.run([ 'sbatch' , '--time', '10:00:00' - , '--mem', '50GB' + , '--mem', 
mem_size(ls) , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' From 8f7d9d8e80c405b167a9bc51784eafec94db550b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:42:36 +0000 Subject: [PATCH 025/204] dynamically chose n gpus based on latent space size + fix mem allocation as well --- slurm_shape_embed.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index 3b78ee35..bd47eef3 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -58,11 +58,19 @@ def mem_size(ls): if ls <= 128: return '50GB' - if ls <= 256: + if ls > 128: return '100GB' - if ls <= 512: + if ls > 256: return '300GB' +def n_gpus(ls): + if ls <= 128: + return 'gpus:2' + if ls > 128: + return 'gpus:2' + if ls > 256: + return 'gpus:3' + if __name__ == "__main__": slurmdir = f'{os.getcwd()}/slurmdir' @@ -86,6 +94,6 @@ def mem_size(ls): , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' - , '--gres', 'gpu:2' + , '--gres', n_gpus(ls) , fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) From 41dec50d1ba7afc5c32ff9f0e0e3e9a1d3b709d0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:44:30 +0000 Subject: [PATCH 026/204] fix gpu allocation typo --- slurm_shape_embed.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index bd47eef3..c542dd3e 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -65,11 +65,11 @@ def mem_size(ls): def n_gpus(ls): if ls <= 128: - return 'gpus:2' + return 'gpu:2' if ls > 128: - return 'gpus:2' + return 'gpu:2' if ls > 256: - return 'gpus:3' + return 'gpu:3' if __name__ == "__main__": From 775548a671cc85574873f85e75984c6e67214246 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jan 2024 22:48:03 +0000 Subject: [PATCH 027/204] comment out all the mean and std login for f1 --- scripts/shapes/shape_embed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index d9c6eb86..f9d21974 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,7 +22,6 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse -import wandb # Deal with the filesystem import torch.multiprocessing @@ -467,11 +466,11 @@ def shape_embed_process(clargs): trial_df.plot(kind="bar") # Special metrics for f1 score for wandb - wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) - mean_df = trial_df.groupby("trial").mean() - std_df = trial_df.groupby("trial").std() - wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) - wandb.log({"Std": wandb.Table(dataframe=std_df)}) + #wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + #wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) + #wandb.log({"Std": wandb.Table(dataframe=std_df)}) melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) From 1cf6646619a840abed247653943c39cd9bb57091 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 21:55:25 +0000 Subject: [PATCH 028/204] added a --clear-checkpoints clarg --- scripts/shapes/shape_embed.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
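A note on the `mem_size` and `n_gpus` helpers introduced and reworked in the patches above: once the checks read `if ls <= 128:` followed by `if ls > 128:`, the final `if ls > 256:` branch can never be reached, because any value above 256 has already returned from the `ls > 128` branch. A minimal sketch of an ordering in which every tier is reachable, assuming the intended tiers are 50GB/100GB/300GB of memory and 2/2/3 GPUs for small/medium/large latent spaces:

    def mem_size(ls):
        # test the largest threshold first so each tier is reachable
        if ls > 256:
            return '300GB'
        if ls > 128:
            return '100GB'
        return '50GB'

    def n_gpus(ls):
        # same idea for the --gres request
        if ls > 256:
            return 'gpu:3'
        return 'gpu:2'
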
a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f9d21974..a70d02fc 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -22,6 +22,8 @@ from types import SimpleNamespace from pytorch_lightning.callbacks.early_stopping import EarlyStopping import argparse +import wandb +import shutil # Deal with the filesystem import torch.multiprocessing @@ -295,6 +297,9 @@ def shape_embed_process(clargs): dataloader.setup() model.eval() + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") @@ -540,7 +545,9 @@ def auto_pos_int (x): parser.add_argument( '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") - shape_embed_process(parser.parse_args()) + shape_embed_process(parser.parse_args()) \ No newline at end of file From 704c88fc8dcdf6ba915f0ea59f0fa70e5ee4092e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 21:55:57 +0000 Subject: [PATCH 029/204] use wandblogger to log info (mean, std dev...) --- scripts/shapes/shape_embed.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index a70d02fc..25326431 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -303,15 +303,15 @@ def shape_embed_process(clargs): model_dir = f"checkpoints/{hashing_fn(args)}" tb_logger = pl_loggers.TensorBoardLogger(f"logs/") - wandb = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") + wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) - wandb.watch(lit_model, log="all") + wandblogger.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb,tb_logger], + logger=[wandblogger,tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -469,13 +469,18 @@ def shape_embed_process(clargs): trial_df.to_csv(metadata(f"trial_df.csv")) trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") - - # Special metrics for f1 score for wandb - #wandb.log({"trial_df": wandb.Table(dataframe=trial_df)}) + #mean_df = trial_df.groupby("trial").mean() #std_df = trial_df.groupby("trial").std() - #wandb.log({"Mean": wandb.Table(dataframe=mean_df)}) - #wandb.log({"Std": wandb.Table(dataframe=std_df)}) + #wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) From 15343f691d85a4f9137b76d79a1071a35891a353 Mon Sep 
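The patch above renames the `WandbLogger` instance from `wandb` to `wandblogger`, which avoids shadowing the imported `wandb` module and lets `wandb.Table` be used for the score tables. A minimal sketch of the resulting pattern, with a tiny hypothetical score table standing in for the real `trial_df`:

    import pandas as pd
    import wandb
    from pytorch_lightning import loggers as pl_loggers

    # hypothetical per-trial scores standing in for the real trial_df
    trial_df = pd.DataFrame({"trial": ["mask_embed", "regionprops"], "test_f1": [0.91, 0.84]})

    wandblogger = pl_loggers.WandbLogger(project="shape-embed", name="example-run")
    # the Lightning wrapper exposes the underlying wandb run as .experiment,
    # so tables are logged through it rather than through the logger itself
    wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)})
    wandblogger.experiment.log({"Mean": wandb.Table(dataframe=trial_df.groupby("trial").mean())})
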
17 00:00:00 2001 From: Anna Foix Date: Tue, 23 Jan 2024 23:13:19 +0000 Subject: [PATCH 030/204] run individual jobs in own folder to work around checkpoints --- slurm_shape_embed.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/slurm_shape_embed.py b/slurm_shape_embed.py index c542dd3e..daea5ca5 100644 --- a/slurm_shape_embed.py +++ b/slurm_shape_embed.py @@ -52,7 +52,11 @@ echo " - model {model}" echo " - batch size {b_size}" echo " - latent space size {ls_size}" -python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} +rand_name=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 16) +mkdir -p slurm_rundir/$rand_name +cp -r $(ls | grep -v slurm_rundir) slurm_rundir/$rand_name/. +cd slurm_rundir/$rand_name +python3 scripts/shapes/shape_embed.py --model {model} --batch-size {b_size} --latent-space-size {ls_size} --clear-checkpoints """ def mem_size(ls): From 49813811129bf89db88460dd0c24dc9c58e4dada Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:47:13 +0000 Subject: [PATCH 031/204] Updating pythae --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 82fb7df8..23672073 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ scikit-image = "^0.21.0" iteround = "^1.0.4" ipykernel = "^6.25.1" nonechucks = "^0.4.2" -pythae = "^0.1.1" +pythae = { git = "https://github.com/clementchadebec/benchmark_VAE.git", branch = "main" } pytest = "^7.4.0" pandas = "^2.1.0" bokeh = "^3.2.2" From 9da2a22a00b9de833e36e50c13f65de46fd66346 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:48:30 +0000 Subject: [PATCH 032/204] Adding standard scalar to df scoring fun --- scripts/shapes/shape_embed.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f70ed089..59544fed 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -1,6 +1,8 @@ # %% import seaborn as sns import pyefd +from sklearn.decomposition import PCA +from sklearn.discriminant_analysis import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_validate, KFold, train_test_split from sklearn.metrics import make_scorer @@ -9,6 +11,7 @@ import matplotlib as mpl import seaborn as sns from pathlib import Path +from sklearn.pipeline import Pipeline import umap from torch.autograd import Variable from types import SimpleNamespace @@ -77,14 +80,20 @@ def scoring_df(X, y): } # Create a random forest classifier - clf = RandomForestClassifier() + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + # ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", RandomForestClassifier()), + ] + ) # Specify the number of folds k_folds = 10 # Perform k-fold cross-validation cv_results = cross_validate( - estimator=clf, + estimator=pipeline, X=X, y=y, cv=KFold(n_splits=k_folds), From 873ef920fe40d3a7a6782bd4323fa998c65711c7 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:49:31 +0000 Subject: [PATCH 033/204] Refactoring and adding back umap --- scripts/shapes/shape_embed.py | 69 ++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 59544fed..e8fda82f 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -23,6 +23,8 @@ 
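On the StandardScaler change above: because the scaler sits inside the Pipeline handed to cross_validate, it is re-fit on the training portion of every fold, so the held-out fold never leaks into the normalisation statistics. A minimal self-contained sketch of the same pattern (random stand-in features; note the canonical import path for StandardScaler is sklearn.preprocessing):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import KFold, cross_validate
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    X = np.random.rand(60, 8)             # hypothetical feature matrix
    y = np.random.randint(0, 2, size=60)  # hypothetical binary labels

    pipeline = Pipeline([
        ("scaler", StandardScaler()),       # fit on each training fold only
        ("clf", RandomForestClassifier()),
    ])

    cv_results = cross_validate(pipeline, X, y, cv=KFold(n_splits=5),
                                scoring="f1_macro", n_jobs=-1)
    print(cv_results["test_score"].mean())
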
import pytorch_lightning as pl import torch from types import SimpleNamespace +from umap import UMAP +import os # Deal with the filesystem import torch.multiprocessing @@ -66,6 +68,41 @@ def hashing_fn(args): hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() return hashed_string + +def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): + umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + mask = np.random.rand(len(df)) < 0.7 + + semi_labels = df["Class"].copy() + semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision + + umap_embedding = umap_reducer.fit_transform(df, y=semi_labels) + + ax = sns.relplot( + data=pd.DataFrame(umap_embedding, columns=["umap0", "umap1"]), + x="umap0", + y="umap1", + hue="Class", + palette="deep", + alpha=0.5, + edgecolor=None, + s=5, + height=height, + aspect=0.5 * width / height, + ) + + sns.move_legend( + ax, + "upper center", + ) + ax.set(xlabel=None, ylabel=None) + sns.despine(left=True, bottom=True) + plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) + plt.tight_layout() + plt.savefig(metadata(f"umap_no_axes.pdf")) + # plt.show() + plt.close() + def scoring_df(X, y): # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split( @@ -370,11 +407,6 @@ def shape_embed_process(): idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) - y_partial = y.copy() - indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) - y_partial[indices] = -1 - y_blind = -1 * np.ones_like(y) - df = pd.DataFrame(latent_space.numpy()) df["Class"] = y # Map numeric classes to their labels @@ -384,31 +416,8 @@ def shape_embed_process(): df = df.set_index("Class") df_shape_embed = df.copy() - ax = sns.relplot( - data=df, - x="umap0", - y="umap1", - hue="Class", - palette="deep", - alpha=0.5, - edgecolor=None, - s=5, - height=height, - aspect=0.5 * width / height, - ) - - sns.move_legend( - ax, - "upper center", - ) - ax.set(xlabel=None, ylabel=None) - sns.despine(left=True, bottom=True) - plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) - plt.tight_layout() - plt.savefig(metadata(f"umap_no_axes.pdf")) - # plt.show() - plt.close() - + # %% UMAP plot + umap_plot(df, metadata, width, height) # %% X = df_shape_embed.to_numpy() From 752736386338364f5ec780bb9aef30732e6bcf41 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:49:52 +0000 Subject: [PATCH 034/204] Seed "everything" --- scripts/shapes/shape_embed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index e8fda82f..4efd6352 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -62,6 +62,10 @@ logger = logging.getLogger(__name__) +# Seed everything +np.random.seed(42) +pl.seed_everything(42) + def hashing_fn(args): serialized_args = pickle.dumps(vars(args)) hash_object = hashlib.sha256(serialized_args) From afb0368e68cd3b9b68c4b3236f7137249ac50a0e Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:07 +0000 Subject: [PATCH 035/204] Reduce k folds (should be a hparam) --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 4efd6352..f9e0d716 100644 --- a/scripts/shapes/shape_embed.py +++ 
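The umap_plot added above relies on UMAP's semi-supervised mode: points whose label is set to -1 are treated as unlabelled, while the remaining labels guide the embedding. A minimal sketch with hypothetical latent vectors; note that if the feature rows are shuffled before fitting (as a later revision does with df.sample(frac=1)), the label array has to be reordered the same way to stay aligned:

    import numpy as np
    from umap import UMAP

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 16))      # hypothetical latent vectors
    y = rng.integers(0, 2, size=200)    # hypothetical class codes

    semi_labels = y.copy()
    semi_labels[rng.random(200) > 0.8] = -1   # hide ~20% of labels; -1 means "unlabelled"

    embedding = UMAP(n_neighbors=15, min_dist=0.1, n_components=2,
                     random_state=42).fit_transform(X, y=semi_labels)
    print(embedding.shape)  # (200, 2)
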
b/scripts/shapes/shape_embed.py @@ -130,7 +130,7 @@ def scoring_df(X, y): ) # Specify the number of folds - k_folds = 10 + k_folds = 5 # Perform k-fold cross-validation cv_results = cross_validate( From 941dc804b195a4f9ec918b01166149f39bb11536 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:28 +0000 Subject: [PATCH 036/204] Update args to match what we now think is good --- scripts/shapes/shape_embed.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f9e0d716..72239453 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -165,26 +165,18 @@ def shape_embed_process(): window_size = 128 * 2 params = { - "model":"resnet18_vqvae_legacy", - "epochs": 75, + "model": "resnet50_vqvae", + "epochs": 250, "batch_size": 4, "num_workers": 2**4, "input_dim": (3, interp_size, interp_size), - "latent_dim": interp_size, - "num_embeddings": interp_size, - "num_hiddens": interp_size, - "num_residual_hiddens": 32, - "num_residual_layers": 150, + "latent_dim": int(128), "pretrained": True, - # "embedding_dim": 32, - # "num_embeddings": 16, - "commitment_cost": 0.25, - "decay": 0.99, "frobenius_norm": False, } optimizer_params = { - "opt": "LAMB", + "opt": "AdamW", "lr": 0.001, "weight_decay": 0.0001, "momentum": 0.9, From 2ccf8b4ed64564fc398830e680bf693f60f14712 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 11:50:53 +0000 Subject: [PATCH 037/204] Dynamic best weights finding --- scripts/shapes/shape_embed.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 72239453..b1b253f8 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -348,11 +348,17 @@ def shape_embed_process(): Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) - checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) wandb.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb,tb_logger], + logger=[wandb, tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, @@ -364,12 +370,20 @@ def shape_embed_process(): log_every_n_steps=1, ) # %% - try: - trainer.fit( - lit_model, datamodule=dataloader, ckpt_path=f"{model_dir}/last.ckpt" - ) - except: - trainer.fit(lit_model, datamodule=dataloader) + + # Determine the checkpoint path for resuming + last_checkpoint_path = f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + + trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) lit_model.eval() From d7761b6d9faa8df7d14ecb670ba5def3d2124365 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 13:09:25 +0000 Subject: [PATCH 038/204] Fixed umap --- scripts/shapes/shape_embed.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b1b253f8..b38b4722 100644 --- a/scripts/shapes/shape_embed.py +++ 
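The resume logic in the "Dynamic best weights finding" patch above boils down to: prefer last.ckpt, fall back to the checkpoint callback's best model (best_model_path is an empty string until something has been saved), otherwise start from scratch. A small sketch of the same decision as a helper; the function name is ours, not part of the codebase:

    import os

    def pick_resume_checkpoint(model_dir, best_model_path=""):
        """Return the checkpoint to resume from, or None to start fresh."""
        last_checkpoint_path = os.path.join(model_dir, "last.ckpt")
        if os.path.isfile(last_checkpoint_path):
            return last_checkpoint_path
        if best_model_path and os.path.isfile(best_model_path):
            return best_model_path
        return None

    # e.g. trainer.fit(lit_model, datamodule=dataloader,
    #                  ckpt_path=pick_resume_checkpoint(model_dir, checkpoint_callback.best_model_path))
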
b/scripts/shapes/shape_embed.py @@ -73,17 +73,19 @@ def hashing_fn(args): return hashed_string -def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): +def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618, split=0.8): umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) - mask = np.random.rand(len(df)) < 0.7 + mask = np.random.rand(len(df)) < split - semi_labels = df["Class"].copy() - semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision + semi_labels = df.index.codes.copy() + semi_labels[~mask] = -1 - umap_embedding = umap_reducer.fit_transform(df, y=semi_labels) + umap_embedding = umap_reducer.fit_transform(df.sample(frac=1), y=semi_labels) ax = sns.relplot( - data=pd.DataFrame(umap_embedding, columns=["umap0", "umap1"]), + data=pd.DataFrame( + umap_embedding, columns=["umap0", "umap1"], index=df.index + ).reset_index(), x="umap0", y="umap1", hue="Class", @@ -107,6 +109,7 @@ def umap_plot(df, metadata, width=3.45, height=3.45 / 1.618): # plt.show() plt.close() + def scoring_df(X, y): # Split the data into training and test sets X_train, X_test, y_train, y_test = train_test_split( From 22832d6b131266c44d7b71b1c6a954d3196c64ae Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 21 Feb 2024 13:10:09 +0000 Subject: [PATCH 039/204] Made the class column categorical --- scripts/shapes/shape_embed.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b38b4722..49cc9c1d 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -424,17 +424,16 @@ def shape_embed_process(): df["Class"] = y # Map numeric classes to their labels idx_to_class = {0: "alive", 1: "dead"} - df["Class"] = df["Class"].map(idx_to_class) + df["Class"] = df["Class"].map(idx_to_class).astype("category") df["Scale"] = scalings[:, 0].squeeze() df = df.set_index("Class") df_shape_embed = df.copy() # %% UMAP plot - umap_plot(df, metadata, width, height) - # %% + umap_plot(df, metadata, width, height,split=0.9) X = df_shape_embed.to_numpy() - y = df_shape_embed.index.values + y = df_shape_embed.index properties = [ "area", From e567748a3e12e935e8e1e847941fd9459ed0d98c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 12:31:26 +0000 Subject: [PATCH 040/204] modification for slurm --- scripts/shapes/shape_embed_backup.py | 558 +++++++++++++++++++++++++++ 1 file changed, 558 insertions(+) create mode 100644 scripts/shapes/shape_embed_backup.py diff --git a/scripts/shapes/shape_embed_backup.py b/scripts/shapes/shape_embed_backup.py new file mode 100644 index 00000000..eea708e4 --- /dev/null +++ b/scripts/shapes/shape_embed_backup.py @@ -0,0 +1,558 @@ +# %% +import seaborn as sns +import pyefd +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import cross_validate, KFold, train_test_split +from sklearn.metrics import make_scorer +import pandas as pd +from sklearn import metrics +import matplotlib as mpl +import seaborn as sns +from pathlib import Path +import umap +from torch.autograd import Variable +from types import SimpleNamespace +import numpy as np +import logging +from skimage import measure +import umap.plot +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +import pytorch_lightning as pl +import torch +from types import SimpleNamespace +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +import argparse +import wandb +import shutil + +# Deal with the 
filesystem +import torch.multiprocessing + +torch.multiprocessing.set_sharing_strategy("file_system") + +from bioimage_embed import shapes +import bioimage_embed + +# Note - you must have torchvision installed for this example + +from pytorch_lightning import loggers as pl_loggers +from torchvision import transforms +from bioimage_embed.lightning import DataModule + +from torchvision import datasets +from bioimage_embed.shapes.transforms import ( + ImageToCoords, + CropCentroidPipeline, + DistogramToCoords, + MaskToDistogramPipeline, + RotateIndexingClockwise, +) + +import matplotlib.pyplot as plt + +from bioimage_embed.lightning import DataModule +import matplotlib as mpl +from matplotlib import rc + +import logging +import pickle +import base64 +import hashlib + +logger = logging.getLogger(__name__) + +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def scoring_df(X, y): + # Split the data into training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y + ) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.accuracy_score), + "precision": make_scorer(metrics.precision_score, average="macro"), + "recall": make_scorer(metrics.recall_score, average="macro"), + "f1": make_scorer(metrics.f1_score, average="macro"), + } + + # Create a random forest classifier + clf = RandomForestClassifier() + + # Specify the number of folds + k_folds = 10 + + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=clf, + X=X, + y=y, + cv=KFold(n_splits=k_folds), + scoring=scoring, + n_jobs=-1, + return_train_score=False, + ) + + # Put the results into a DataFrame + return pd.DataFrame(cv_results) + + +def shape_embed_process(clargs): + # Setting the font size + mpl.rcParams["font.size"] = 10 + + # rc("text", usetex=True) + rc("font", **{"family": "sans-serif", "sans-serif": ["Arial"]}) + width = 3.45 + height = width / 1.618 + plt.rcParams["figure.figsize"] = [width, height] + + sns.set(style="white", context="notebook", rc={"figure.figsize": (width, height)}) + + # matplotlib.use("TkAgg") + interp_size = clargs.latent_space_size * 2 + #interp_size = 128 * 2 + max_epochs = 100 + window_size = clargs.latent_space_size * 2 + #window_size = 128 * 2 + + params = { + "model":clargs.model, + #"model":"resnet18_vae", + "epochs": 75, + "batch_size": clargs.batch_size, + #"batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, interp_size, interp_size), + "latent_dim": interp_size, + "num_embeddings": interp_size, + "num_hiddens": interp_size, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + } + + optimizer_params = { + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + } + + lr_scheduler_params = { + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, + } + + args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) + + dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm/" + dataset = "bbbc010" + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" + metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" + + path = Path(metadata("")) + path.mkdir(parents=True, exist_ok=True) + # %% + + transform_crop = 
CropCentroidPipeline(window_size) + transform_dist = MaskToDistogramPipeline( + window_size, interp_size, matrix_normalised=False + ) + transform_mdscoords = DistogramToCoords(window_size) + transform_coords = ImageToCoords(window_size) + + transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + + transform_mask_to_crop = transforms.Compose( + [ + # transforms.ToTensor(), + transform_mask_to_gray, + transform_crop, + ] + ) + + transform_mask_to_dist = transforms.Compose( + [ + transform_mask_to_crop, + transform_dist, + ] + ) + transform_mask_to_coords = transforms.Compose( + [ + transform_mask_to_crop, + transform_coords, + ] + ) + + transforms_dict = { + "none": transform_mask_to_gray, + "transform_crop": transform_mask_to_crop, + "transform_dist": transform_mask_to_dist, + "transform_coords": transform_mask_to_coords, + } + + train_data = { + key: datasets.ImageFolder(train_data_path, transform=value) + for key, value in transforms_dict.items() + } + + for key, value in train_data.items(): + print(key, len(value)) + plt.imshow(train_data[key][0][0], cmap="gray") + plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + # plt.show() + plt.close() + + # plt.scatter(*train_data["transform_coords"][0][0]) + # plt.savefig(metadata(f"transform_coords.png")) + # plt.show() + + # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") + # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) + # plt.show() + # plt.savefig(metadata(f"transform_coords.png")) + + # Retrieve the coordinates and cropped image + coords = train_data["transform_coords"][0][0] + crop_image = train_data["transform_crop"][0][0] + + fig = plt.figure(frameon=True) + ax = plt.Axes(fig, [0, 0, 1, 1]) + ax.set_axis_off() + fig.add_axes(ax) + + # Display the cropped image using grayscale colormap + plt.imshow(crop_image, cmap="gray_r") + + # Scatter plot with smaller point size + plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) + + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) + + # Close the plot + plt.close() + # import albumentations as A + # %% + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] + ) + + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + except Exception as e: + # A better way to do with would be with batch collation + print(f"Error occurred for image {idx}: {e}") + + # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) + dataloader = DataModule( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + ) + + # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) + # + model = bioimage_embed.models.create_model( + model=args.model, + input_dim=args.input_dim, + latent_dim=args.latent_dim, + pretrained=args.pretrained, + ) + + # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() + + # lit_model = 
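The valid-indices loop above walks the whole dataset once up front, drops any image whose transform raises, and wraps the survivors in a Subset; the in-line comment notes that batch collation would be a better place to do this. A sketch of that alternative under illustrative names (not from the codebase): let __getitem__ swallow the failure and return None, then filter the Nones in a custom collate_fn:

    import torch
    from torch.utils.data import DataLoader, Dataset
    from torch.utils.data.dataloader import default_collate

    class SkipFailingTransforms(Dataset):
        """Wrap a dataset so samples whose transform fails become None."""
        def __init__(self, dataset):
            self.dataset = dataset
        def __len__(self):
            return len(self.dataset)
        def __getitem__(self, idx):
            try:
                return self.dataset[idx]
            except Exception:
                return None

    def drop_none_collate(batch):
        batch = [sample for sample in batch if sample is not None]
        return default_collate(batch) if batch else None

    # loader = DataLoader(SkipFailingTransforms(dataset), batch_size=4,
    #                     collate_fn=drop_none_collate)
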
shapes.MaskEmbedLatentAugment(model, args) + lit_model = shapes.MaskEmbed(model, args) + test_data = dataset[0][0].unsqueeze(0) + # test_lit_data = 2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) + test_output = lit_model.forward((test_data,)) + + dataloader.setup() + model.eval() + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(args)}" + + tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") + + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint(dirpath=f"{model_dir}/", save_last=True) + wandblogger.watch(lit_model, log="all") + + trainer = pl.Trainer( + logger=[wandblogger,tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + accelerator="gpu", + accumulate_grad_batches=4, + callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=args.epochs, + log_every_n_steps=1, + ) + # %% + try: + trainer.fit( + lit_model, datamodule=dataloader, ckpt_path=f"{model_dir}/last.ckpt" + ) + except: + trainer.fit(lit_model, datamodule=dataloader) + + lit_model.eval() + + validation = trainer.validate(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) + example_input = Variable(torch.rand(1, *args.input_dim)) + + # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") + # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") + + # %% + # Inference + + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} + y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) + + y_partial = y.copy() + indices = np.random.choice(y.size, int(0.3 * y.size), replace=False) + y_partial[indices] = -1 + y_blind = -1 * np.ones_like(y) + + df = pd.DataFrame(latent_space.numpy()) + df["Class"] = y + # Map numeric classes to their labels + idx_to_class = {0: "alive", 1: "dead"} + df["Class"] = df["Class"].map(idx_to_class) + df["Scale"] = scalings[:, 0].squeeze() + df = df.set_index("Class") + df_shape_embed = df.copy() + + # %% + + X = df_shape_embed.to_numpy() + y = df_shape_embed.index.values + + properties = [ + "area", + "perimeter", + "centroid", + "major_axis_length", + "minor_axis_length", + "orientation", + ] + dfs = [] + for i, data in enumerate(train_data["transform_crop"]): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, thus we take the first element [0] + # props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is 
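For the regionprops baseline being assembled here, measure.regionprops_table returns a dict of per-object arrays that drops straight into a DataFrame, one row per labelled object. A tiny self-contained sketch with a hypothetical single-object mask ('centroid' is omitted because it expands into centroid-0/centroid-1 columns):

    import numpy as np
    import pandas as pd
    from skimage import measure

    mask = np.zeros((64, 64), dtype=int)
    mask[16:48, 20:44] = 1   # hypothetical mask containing a single object

    props_table = measure.regionprops_table(
        mask,
        properties=["area", "perimeter", "major_axis_length",
                    "minor_axis_length", "orientation"],
    )
    df = pd.DataFrame(props_table)
    print(df)
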
contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + df_regionprops = pd.concat(dfs) + + # Assuming 'dataset_contour' is your DataLoader for the dataset + dfs = [] + for i, data in enumerate(train_data["transform_coords"]): + # Convert the tensor to a numpy array + X, y = data + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + df_pyefd = pd.concat(dfs) + + trials = [ + { + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_pyefd.xs("coeffs", level="coeffs"), + "labels": df_pyefd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), + # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + }, + ] + + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + print(trial["score_df"]) + trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(metadata(f"trial_df.csv")) + trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) + trial_df.plot(kind="bar") + + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + #wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(metadata(f"trials_barplot.pdf")) + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + print(avs) + # tikzplotlib.save(metadata(f"trials_barplot.tikz")) + + + + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') + #parser.add_argument('-v', '--verbose', action='count', default=0, + # help="Increase verbosity level by adding more \"v\".") + + shape_embed_process(parser.parse_args()) From 161b0a0e6f9fe6dfa323628043e8ad8b634aff23 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 12:33:00 +0000 Subject: [PATCH 041/204] changes in the shape embed script --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 25326431..eea708e4 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -138,7 +138,7 @@ def shape_embed_process(clargs): } optimizer_params = { - "opt": "LAMB", + "opt": "AdamW", "lr": 0.001, "weight_decay": 0.0001, "momentum": 0.9, @@ -555,4 +555,4 @@ def auto_pos_int (x): #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") - shape_embed_process(parser.parse_args()) \ No newline at end of file + shape_embed_process(parser.parse_args()) From a6cb292c1650c0c626ce08bb4f1c7d024fd48a60 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:24:25 +0000 Subject: [PATCH 042/204] fix merge commit + add command line args for dataset (name and path) and wandb project --- scripts/shapes/shape_embed.py | 810 +++++++++++++++++----------------- 1 file changed, 409 insertions(+), 401 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 24aad0fc..32e6b898 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -174,406 +174,406 @@ def shape_embed_process(clargs): #window_size = 128 * 2 params = { - "model":clargs.model, - #"model":"resnet18_vae", - "epochs": 250, - "batch_size": clargs.batch_size, - #"batch_size": 4, - "num_workers": 2**4, - "input_dim": (3, interp_size, interp_size), - "latent_dim": interp_size, - "num_embeddings": interp_size, - "num_hiddens": interp_size, - "pretrained": True, - "commitment_cost": 0.25, - "decay": 0.99, - "frobenius_norm": False, -} - -optimizer_params = { - "opt": "AdamW", - "lr": 0.001, - "weight_decay": 0.0001, - "momentum": 0.9, -} - -lr_scheduler_params = { - "sched": "cosine", - "min_lr": 1e-4, - "warmup_epochs": 5, - "warmup_lr": 1e-6, - "cooldown_epochs": 10, - "t_max": 50, - "cycle_momentum": False, -} - -args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - -dataset_path = 
"bbbc010/BBBC010_v1_foreground_eachworm/" -dataset = "bbbc010" -train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" -metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" - -path = Path(metadata("")) -path.mkdir(parents=True, exist_ok=True) -# %% - -transform_crop = CropCentroidPipeline(window_size) -transform_dist = MaskToDistogramPipeline( - window_size, interp_size, matrix_normalised=False -) -transform_mdscoords = DistogramToCoords(window_size) -transform_coords = ImageToCoords(window_size) - -transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) - -transform_mask_to_crop = transforms.Compose( - [ - # transforms.ToTensor(), - transform_mask_to_gray, - transform_crop, - ] -) - -transform_mask_to_dist = transforms.Compose( - [ - transform_mask_to_crop, - transform_dist, - ] -) -transform_mask_to_coords = transforms.Compose( - [ - transform_mask_to_crop, - transform_coords, - ] -) - -transforms_dict = { - "none": transform_mask_to_gray, - "transform_crop": transform_mask_to_crop, - "transform_dist": transform_mask_to_dist, - "transform_coords": transform_mask_to_coords, -} - -train_data = { - key: datasets.ImageFolder(train_data_path, transform=value) - for key, value in transforms_dict.items() -} - -for key, value in train_data.items(): - print(key, len(value)) - plt.imshow(train_data[key][0][0], cmap="gray") - plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + "model":clargs.model, + #"model":"resnet18_vae", + "epochs": 250, + "batch_size": clargs.batch_size, + #"batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, interp_size, interp_size), + "latent_dim": interp_size, + "num_embeddings": interp_size, + "num_hiddens": interp_size, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + } + + optimizer_params = { + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + } + + lr_scheduler_params = { + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, + } + + args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) + + dataset_path = clargs.dataset[1] + train_data_path = f"/nfs/research/uhlmann/afoix/{dataset_path}" + metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" + + path = Path(metadata("")) + path.mkdir(parents=True, exist_ok=True) + # %% + + transform_crop = CropCentroidPipeline(window_size) + transform_dist = MaskToDistogramPipeline( + window_size, interp_size, matrix_normalised=False + ) + transform_mdscoords = DistogramToCoords(window_size) + transform_coords = ImageToCoords(window_size) + + transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + + transform_mask_to_crop = transforms.Compose( + [ + # transforms.ToTensor(), + transform_mask_to_gray, + transform_crop, + ] + ) + + transform_mask_to_dist = transforms.Compose( + [ + transform_mask_to_crop, + transform_dist, + ] + ) + transform_mask_to_coords = transforms.Compose( + [ + transform_mask_to_crop, + transform_coords, + ] + ) + + transforms_dict = { + "none": transform_mask_to_gray, + "transform_crop": transform_mask_to_crop, + "transform_dist": transform_mask_to_dist, + "transform_coords": transform_mask_to_coords, + } + + train_data = { + key: datasets.ImageFolder(train_data_path, transform=value) + for key, value in transforms_dict.items() + } + + for key, value in train_data.items(): + print(key, len(value)) + plt.imshow(train_data[key][0][0], 
cmap="gray") + plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") + # plt.show() + plt.close() + + # plt.scatter(*train_data["transform_coords"][0][0]) + # plt.savefig(metadata(f"transform_coords.png")) # plt.show() + + # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") + # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) + # plt.show() + # plt.savefig(metadata(f"transform_coords.png")) + + # Retrieve the coordinates and cropped image + coords = train_data["transform_coords"][0][0] + crop_image = train_data["transform_crop"][0][0] + + fig = plt.figure(frameon=True) + ax = plt.Axes(fig, [0, 0, 1, 1]) + ax.set_axis_off() + fig.add_axes(ax) + + # Display the cropped image using grayscale colormap + plt.imshow(crop_image, cmap="gray_r") + + # Scatter plot with smaller point size + plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) + + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) + + # Close the plot plt.close() - -# plt.scatter(*train_data["transform_coords"][0][0]) -# plt.savefig(metadata(f"transform_coords.png")) -# plt.show() - -# plt.imshow(train_data["transform_crop"][0][0], cmap="gray") -# plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) -# plt.show() -# plt.savefig(metadata(f"transform_coords.png")) - -# Retrieve the coordinates and cropped image -coords = train_data["transform_coords"][0][0] -crop_image = train_data["transform_crop"][0][0] - -fig = plt.figure(frameon=True) -ax = plt.Axes(fig, [0, 0, 1, 1]) -ax.set_axis_off() -fig.add_axes(ax) - -# Display the cropped image using grayscale colormap -plt.imshow(crop_image, cmap="gray_r") - -# Scatter plot with smaller point size -plt.scatter(*coords, c=np.arange(interp_size), cmap="rainbow", s=2) - -# Save the plot as an image without border and coordinate axes -plt.savefig(metadata(f"transform_coords.png"), bbox_inches="tight", pad_inches=0) - -# Close the plot -plt.close() -# import albumentations as A -# %% -gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) -transform = transforms.Compose( - [ - transform_mask_to_dist, - transforms.ToTensor(), - RotateIndexingClockwise(p=1), - gray2rgb, - ] -) - -dataset = datasets.ImageFolder(train_data_path, transform=transform) - -valid_indices = [] -# Iterate through the dataset and apply the transform to each image -for idx in range(len(dataset)): - try: - image, label = dataset[idx] - # If the transform works without errors, add the index to the list of valid indices - valid_indices.append(idx) - except Exception as e: - # A better way to do with would be with batch collation - print(f"Error occurred for image {idx}: {e}") - -# Create a Subset using the valid indices -dataset = torch.utils.data.Subset(dataset, valid_indices) -dataloader = DataModule( - dataset, - batch_size=args.batch_size, - shuffle=True, - num_workers=args.num_workers, -) - -# model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) -# -model = bioimage_embed.models.create_model( - model=args.model, - input_dim=args.input_dim, - latent_dim=args.latent_dim, - pretrained=args.pretrained, -) - -# model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() - -# lit_model = shapes.MaskEmbedLatentAugment(model, args) -lit_model = shapes.MaskEmbed(model, args) -test_data = dataset[0][0].unsqueeze(0) -# test_lit_data = 
2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) -test_output = lit_model.forward((test_data,)) - -dataloader.setup() -model.eval() - -if clargs.clear_checkpoints: - print("cleaning checkpoints") - shutil.rmtree("checkpoints/") -model_dir = f"checkpoints/{hashing_fn(args)}" - -tb_logger = pl_loggers.TensorBoardLogger(f"logs/") -wandblogger = pl_loggers.WandbLogger(project="shape-embed", name=f"{params['model']}_{interp_size}_{params['batch_size']}") - -Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) - -checkpoint_callback = ModelCheckpoint( - dirpath=f"{model_dir}/", - save_last=True, - save_top_k=1, - monitor="loss/val", - mode="min", -) -wandb.watch(lit_model, log="all") - -trainer = pl.Trainer( - logger=[wandb, tb_logger], - gradient_clip_val=0.5, - enable_checkpointing=True, - devices=1, - accelerator="gpu", - accumulate_grad_batches=4, - callbacks=[checkpoint_callback], - min_epochs=50, - max_epochs=args.epochs, - log_every_n_steps=1, -) -# %% - -# Determine the checkpoint path for resuming -last_checkpoint_path = f"{model_dir}/last.ckpt" -best_checkpoint_path = checkpoint_callback.best_model_path - -# Check if a last checkpoint exists to resume from -if os.path.isfile(last_checkpoint_path): - resume_checkpoint = last_checkpoint_path -elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): - resume_checkpoint = best_checkpoint_path -else: - resume_checkpoint = None - -trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) - -lit_model.eval() - -validation = trainer.validate(lit_model, datamodule=dataloader) -testing = trainer.test(lit_model, datamodule=dataloader) -example_input = Variable(torch.rand(1, *args.input_dim)) - -# torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") -# torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") - -# %% -# Inference - -dataloader = DataModule( - dataset, - batch_size=1, - shuffle=False, - num_workers=args.num_workers, - # Transform is commented here to avoid augmentations in real data - # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings - # transform=transform, - # transform=transform, -) -dataloader.setup() - -predictions = trainer.predict(lit_model, datamodule=dataloader) - -# Use the namespace variables -latent_space = torch.stack([d.out.z.flatten() for d in predictions]) -scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) -idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} -y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) - -df = pd.DataFrame(latent_space.numpy()) -df["Class"] = y -# Map numeric classes to their labels -idx_to_class = {0: "alive", 1: "dead"} -df["Class"] = df["Class"].map(idx_to_class).astype("category") -df["Scale"] = scalings[:, 0].squeeze() -df = df.set_index("Class") -df_shape_embed = df.copy() - -# %% -# %% UMAP plot -umap_plot(df, metadata, width, height,split=0.9) - -X = df_shape_embed.to_numpy() -y = df_shape_embed.index - -properties = [ - "area", - "perimeter", - "centroid", - "major_axis_length", - "minor_axis_length", - "orientation", -] -dfs = [] -for i, data in enumerate(train_data["transform_crop"]): - X, y = data - # Do regionprops here - # Calculate shape summary statistics using regionprops - # We're considering that the mask has only one object, thus we take the first element [0] - # props = regionprops(np.array(X).astype(int))[0] - props_table = measure.regionprops_table( - 
np.array(X).astype(int), properties=properties + # import albumentations as A + # %% + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, + ] ) - - # Store shape properties in a dataframe - df = pd.DataFrame(props_table) - - # Assuming the class or label is contained in 'y' variable - df["class"] = y - df.set_index("class", inplace=True) - dfs.append(df) - -df_regionprops = pd.concat(dfs) - -# Assuming 'dataset_contour' is your DataLoader for the dataset -dfs = [] -for i, data in enumerate(train_data["transform_coords"]): - # Convert the tensor to a numpy array - X, y = data - - # Feed it to PyEFD's calculate_efd function - coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) - # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) - - norm_coeffs = pyefd.normalize_efd(coeffs) - df = pd.DataFrame( + + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + except Exception as e: + # A better way to do with would be with batch collation + print(f"Error occurred for image {idx}: {e}") + + # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) + dataloader = DataModule( + dataset, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.num_workers, + ) + + # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) + # + model = bioimage_embed.models.create_model( + model=args.model, + input_dim=args.input_dim, + latent_dim=args.latent_dim, + pretrained=args.pretrained, + ) + + # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() + + # lit_model = shapes.MaskEmbedLatentAugment(model, args) + lit_model = shapes.MaskEmbed(model, args) + test_data = dataset[0][0].unsqueeze(0) + # test_lit_data = 2*(dataset[0][0].unsqueeze(0).repeat_interleave(3, dim=1),) + test_output = lit_model.forward((test_data,)) + + dataloader.setup() + model.eval() + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(args)}" + + tb_logger = pl_loggers.TensorBoardLogger(f"logs/") + jobname = f"{params['model']}_{interp_size}_{params['batch_size']}_{clargs.dataset[0]}" + wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) + + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) + wandb.watch(lit_model, log="all") + + trainer = pl.Trainer( + logger=[wandb, tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + accelerator="gpu", + accumulate_grad_batches=4, + callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=args.epochs, + log_every_n_steps=1, + ) + # %% + + # Determine the checkpoint path for resuming + last_checkpoint_path = f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = 
last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + + trainer.fit(lit_model, datamodule=dataloader, ckpt_path=resume_checkpoint) + + lit_model.eval() + + validation = trainer.validate(lit_model, datamodule=dataloader) + testing = trainer.test(lit_model, datamodule=dataloader) + example_input = Variable(torch.rand(1, *args.input_dim)) + + # torch.jit.save(lit_model.to_torchscript(), f"{model_dir}/model.pt") + # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") + + # %% + # Inference + + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + idx_to_class = {v: k for k, v in dataset.dataset.class_to_idx.items()} + y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) + + df = pd.DataFrame(latent_space.numpy()) + df["Class"] = y + # Map numeric classes to their labels + idx_to_class = {0: "alive", 1: "dead"} + df["Class"] = df["Class"].map(idx_to_class).astype("category") + df["Scale"] = scalings[:, 0].squeeze() + df = df.set_index("Class") + df_shape_embed = df.copy() + + # %% + # %% UMAP plot + umap_plot(df, metadata, width, height,split=0.9) + + X = df_shape_embed.to_numpy() + y = df_shape_embed.index + + properties = [ + "area", + "perimeter", + "centroid", + "major_axis_length", + "minor_axis_length", + "orientation", + ] + dfs = [] + for i, data in enumerate(train_data["transform_crop"]): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, thus we take the first element [0] + # props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + df_regionprops = pd.concat(dfs) + + # Assuming 'dataset_contour' is your DataLoader for the dataset + dfs = [] + for i, data in enumerate(train_data["transform_coords"]): + # Convert the tensor to a numpy array + X, y = data + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + df_pyefd = pd.concat(dfs) + + trials = [ { - "norm_coeffs": norm_coeffs.flatten().tolist(), - "coeffs": coeffs.flatten().tolist(), - } - ).T.rename_axis("coeffs") - df["class"] = y - df.set_index("class", inplace=True, append=True) - dfs.append(df) - 
-df_pyefd = pd.concat(dfs) - -trials = [ - { - "name": "mask_embed", - "features": df_shape_embed.to_numpy(), - "labels": df_shape_embed.index, - }, - { - "name": "fourier_coeffs", - "features": df_pyefd.xs("coeffs", level="coeffs"), - "labels": df_pyefd.xs("coeffs", level="coeffs").index, - }, - # {"name": "fourier_norm_coeffs", - # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), - # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index - # } - { - "name": "regionprops", - "features": df_regionprops, - "labels": df_regionprops.index, - }, -] - -trial_df = pd.DataFrame() -for trial in trials: - X = trial["features"] - y = trial["labels"] - trial["score_df"] = scoring_df(X, y) - trial["score_df"]["trial"] = trial["name"] - print(trial["score_df"]) - trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) - trial_df = pd.concat([trial_df, trial["score_df"]]) -trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) - -trial_df.to_csv(metadata(f"trial_df.csv")) -trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) -trial_df.plot(kind="bar") - -#mean_df = trial_df.groupby("trial").mean() -#std_df = trial_df.groupby("trial").std() -#wandb.log_table(mean_df) -#wandb.log_table(std_df) - -#Special metrics for f1 score for wandb -wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) -mean_df = trial_df.groupby("trial").mean() -std_df = trial_df.groupby("trial").std() -wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) -wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) - -melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") -# fig, ax = plt.subplots(figsize=(width, height)) -ax = sns.catplot( - data=melted_df, - kind="bar", - x="trial", - hue="Metric", - y="Score", - errorbar="se", - height=height, - aspect=width * 2**0.5 / height, -) -# ax.xtick_params(labelrotation=45) -# plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) -# sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) -# ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') -# plt.tight_layout() -plt.savefig(metadata(f"trials_barplot.pdf")) -plt.close() - -avs = ( - melted_df.set_index(["trial", "Metric"]) - .xs("test_f1", level="Metric", drop_level=False) - .groupby("trial") - .mean() -) -print(avs) -# tikzplotlib.save(metadata(f"trials_barplot.tikz")) + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_pyefd.xs("coeffs", level="coeffs"), + "labels": df_pyefd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_pyefd.xs("norm_coeffs", level="coeffs"), + # "labels": df_pyefd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + }, + ] + + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + print(trial["score_df"]) + trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(metadata(f"trial_df.csv")) + trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) + trial_df.plot(kind="bar") + + #mean_df = trial_df.groupby("trial").mean() + #std_df = trial_df.groupby("trial").std() + 
#wandb.log_table(mean_df) + #wandb.log_table(std_df) + + #Special metrics for f1 score for wandb + wandblogger.experiment.log({"trial_df": wandb.Table(dataframe=trial_df)}) + mean_df = trial_df.groupby("trial").mean() + std_df = trial_df.groupby("trial").std() + wandblogger.experiment.log({"Mean": wandb.Table(dataframe=mean_df)}) + wandblogger.experiment.log({"Std": wandb.Table(dataframe=std_df)}) + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(metadata(f"trials_barplot.pdf")) + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + print(avs) + # tikzplotlib.save(metadata(f"trials_barplot.tikz")) @@ -582,11 +582,11 @@ def shape_embed_process(clargs): if __name__ == "__main__": -def auto_pos_int (x): - val = int(x,0) - if val <= 0: - raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) - return val + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val parser = argparse.ArgumentParser(description='Run the shape embed pipeline') @@ -608,6 +608,12 @@ def auto_pos_int (x): parser.add_argument( '-m', '--model', choices=models, default=models[0], metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset") + parser.add_argument( + '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' + , help=f"The wandb PROJECT name") parser.add_argument( '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int , help="The BATCH_SIZE for the run, a positive integer (default 4)") @@ -619,4 +625,6 @@ def auto_pos_int (x): #parser.add_argument('-v', '--verbose', action='count', default=0, # help="Increase verbosity level by adding more \"v\".") + #clargs=parser.parse_args() + #print(clargs.dataset) shape_embed_process(parser.parse_args()) From 19990760bcd8fd5e442b1d68d1f01f8d8b266fe7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:26:44 +0000 Subject: [PATCH 043/204] duplicated slurm script + specify dataset --- slurm_shape_embed_dataset.py | 104 +++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 slurm_shape_embed_dataset.py diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py new file mode 100644 index 00000000..2eb2ff61 --- /dev/null +++ b/slurm_shape_embed_dataset.py @@ -0,0 +1,104 @@ +#! 
/usr/bin/env python3 + +import os +import subprocess +import tempfile + +## Assign the arguments to variables +#model_arg=$1 +#sizes_list="${@:2}" +# +## Create SLURM job script +#job_script="slurm_job.sh" +# +#echo "#!/bin/bash" > "$job_script" +#echo "#SBATCH --job-name=ite_shape_embed" >> "$job_script" +#echo "#SBATCH --output=ite_shape_embed.out" >> "$job_script" +#echo "#SBATCH --error=ite_shape_embed.err" >> "$job_script" +#echo "#SBATCH --gres=gpu:2" >> "$job_script" # Adjust the number of CPUs as needed +#echo "#SBATCH --mem=50GB" >> "$job_script" # Adjust the memory requirement as needed +#echo "" >> "$job_script" +# +## Loop through the sizes and append the Python command to the job script +#for size in $sizes_list; do +# echo "python ite_shape_embed.py --model $model_arg --ls_size $size" >> "$job_script" +#done +# +## Submit SLURM job +#sbatch "$job_script" + +models = [ + "resnet18_vae" +, "resnet18_vqvae" +, "resnet18_vqvae_legacy" +, "resnet18_vae_legacy" +] +batch_sizes = [4] +latent_space_sizes = [512] + +datasets = [ + ("vampire", "vampire/torchvision/Control/") +, ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") +, ("synthcell", "synthcellshapes_dataset") +#, ("helakyoto", "") +] + +wandb_project='shape-embed-ite-dataset' + +slurm_script="""#!/bin/bash + +echo "running shape embed with:" +echo " - model {model}" +echo " - dataset {dataset[0]} ({dataset[1]})" +echo " - batch size {b_size}" +echo " - latent space size {ls_size}" +rand_name=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 16) +mkdir -p slurm_rundir/$rand_name +cp -r $(ls | grep -v slurm_rundir) slurm_rundir/$rand_name/. +cd slurm_rundir/$rand_name +python3 scripts/shapes/shape_embed.py --wandb-project {wandb_project} --model {model} --dataset {dataset[0]} {dataset[1]} --batch-size {b_size} --latent-space-size {ls_size} --clear-checkpoints +""" + +def mem_size(ls): + if ls <= 128: + return '50GB' + if ls > 128: + return '100GB' + if ls > 256: + return '300GB' + +def n_gpus(ls): + if ls <= 128: + return 'gpu:2' + if ls > 128: + return 'gpu:2' + if ls > 256: + return 'gpu:3' + +if __name__ == "__main__": + + slurmdir = f'{os.getcwd()}/slurmdir' + os.makedirs(slurmdir, exist_ok=True) + for m, bs, ls, ds in [ (m,bs,ls,ds) for m in models + for bs in batch_sizes + for ls in latent_space_sizes + for ds in datasets ]: + jobname = f'shape_embed_{m}_{ds[0]}_{bs}_{ls}' + print(jobname) + fp = open(mode='w+', file=f'{slurmdir}/slurm_script_shape_embed_{m}_{bs}_{ls}.script') + fp.write(slurm_script.format(model=m, dataset=ds, b_size=bs, ls_size=ls, wandb_project=wandb_project)) + fp.flush() + print(f'{fp.name}') + print(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) + print(mem_size(ls)) + result = subprocess.run([ 'sbatch' + , '--time', '10:00:00' + , '--mem', mem_size(ls) + , '--job-name', jobname + , '--output', f'{slurmdir}/{jobname}.out' + , '--error', f'{slurmdir}/{jobname}.err' + , '--gres', n_gpus(ls) + , fp.name], stdout=subprocess.PIPE) + print(result.stdout.decode('utf-8')) From d3525a8817d11db0960743d19eafb4a34a2ce590 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:52:28 +0000 Subject: [PATCH 044/204] Fix wandb logger --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 32e6b898..948e0f5c 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -367,10 +367,10 @@ def 
shape_embed_process(clargs): monitor="loss/val", mode="min", ) - wandb.watch(lit_model, log="all") + wandblogger.watch(lit_model, log="all") trainer = pl.Trainer( - logger=[wandb, tb_logger], + logger=[wandblogger, tb_logger], gradient_clip_val=0.5, enable_checkpointing=True, devices=1, From 7a60b2a5a66c4f152ab6cbfd65117cbe92de2ab9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 22 Feb 2024 21:52:54 +0000 Subject: [PATCH 045/204] Add helakyoto dataset to the slurm script --- slurm_shape_embed_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 2eb2ff61..a0b16047 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -39,8 +39,8 @@ datasets = [ ("vampire", "vampire/torchvision/Control/") , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") -, ("synthcell", "synthcellshapes_dataset") -#, ("helakyoto", "") +, ("synthcell", "synthcellshapes_dataset/") +, ("helakyoto", "H2b_10x_MD_exp665/samples/") ] wandb_project='shape-embed-ite-dataset' From e43e8394bf97d1fa609b3e1637d812d4dc0f4f68 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:48:24 +0000 Subject: [PATCH 046/204] better imports --- scripts/shapes/shape_embed.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49cc9c1d..f0850a4d 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -9,14 +9,11 @@ import pandas as pd from sklearn import metrics import matplotlib as mpl -import seaborn as sns from pathlib import Path from sklearn.pipeline import Pipeline -import umap from torch.autograd import Variable from types import SimpleNamespace import numpy as np -import logging from skimage import measure import umap.plot from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint @@ -25,17 +22,16 @@ from types import SimpleNamespace from umap import UMAP import os - -# Deal with the filesystem import torch.multiprocessing +import logging +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) torch.multiprocessing.set_sharing_strategy("file_system") from bioimage_embed import shapes import bioimage_embed - -# Note - you must have torchvision installed for this example - from pytorch_lightning import loggers as pl_loggers from torchvision import transforms from bioimage_embed.lightning import DataModule @@ -47,16 +43,15 @@ DistogramToCoords, MaskToDistogramPipeline, RotateIndexingClockwise, + CoordsToDistogram, ) - import matplotlib.pyplot as plt from bioimage_embed.lightning import DataModule import matplotlib as mpl from matplotlib import rc -import logging -import pickle +import pickle import base64 import hashlib @@ -66,6 +61,7 @@ np.random.seed(42) pl.seed_everything(42) + def hashing_fn(args): serialized_args = pickle.dumps(vars(args)) hash_object = hashlib.sha256(serialized_args) From ffbd8eae64043c10ed68d63a35e42e8b1476cfca Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:49:10 +0000 Subject: [PATCH 047/204] Adding dataset path to args for better checkpointing --- scripts/shapes/shape_embed.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f0850a4d..7bb4ce24 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -172,6 +172,9 @@ def shape_embed_process(): "latent_dim": int(128), "pretrained": 
True, "frobenius_norm": False, + # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" + # dataset = "vampire/mefs/data/processed/Control" + "dataset": "synthcellshapes_dataset", } optimizer_params = { @@ -193,15 +196,9 @@ def shape_embed_process(): args = SimpleNamespace(**params, **optimizer_params, **lr_scheduler_params) - #dataset_path = "bbbc010/BBBC010_v1_foreground_eachworm" - dataset_path = "shape_embed_data/data/bbbc010/BBBC010_v1_foreground_eachworm/" - # dataset_path = "vampire/mefs/data/processed/Control" - # dataset_path = "shape_embed_data/data/vampire/torchvision/Control/" - # dataset_path = "vampire/torchvision/Control" - # dataset = "bbbc010" + dataset_path = args.dataset - # train_data_path = f"scripts/shapes/data/{dataset_path}" - train_data_path = f"scripts/shapes/data/{dataset_path}" + train_data_path = f"data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) From a3e82a90eee22acccbcb2858a0ae4800631e5056 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:49:46 +0000 Subject: [PATCH 048/204] Imrproved dataset logic so that dist depends on coords --- scripts/shapes/shape_embed.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 7bb4ce24..b2f95c64 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -206,9 +206,10 @@ def shape_embed_process(): # %% transform_crop = CropCentroidPipeline(window_size) - transform_dist = MaskToDistogramPipeline( - window_size, interp_size, matrix_normalised=False - ) + # transform_dist = MaskToDistogramPipeline( + # window_size, interp_size, matrix_normalised=False + # ) + transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) transform_mdscoords = DistogramToCoords(window_size) transform_coords = ImageToCoords(window_size) @@ -222,16 +223,27 @@ def shape_embed_process(): ] ) - transform_mask_to_dist = transforms.Compose( + transform_mask_to_coords = transforms.Compose( [ transform_mask_to_crop, - transform_dist, + transform_coords, ] ) - transform_mask_to_coords = transforms.Compose( + + transform_mask_to_dist = transforms.Compose( [ - transform_mask_to_crop, - transform_coords, + transform_mask_to_coords, + transform_coord_to_dist, + ] + ) + + gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + transform = transforms.Compose( + [ + transform_mask_to_dist, + transforms.ToTensor(), + RotateIndexingClockwise(p=1), + gray2rgb, ] ) From 5b720d4ba3caaf4b50803b4496267fba49dd7828 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:50:13 +0000 Subject: [PATCH 049/204] Improved data rejection for datatsets --- scripts/shapes/shape_embed.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index b2f95c64..fc29eeb5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -254,11 +254,31 @@ def shape_embed_process(): "transform_coords": transform_mask_to_coords, } + # Apply transform to find which images don't work + dataset = datasets.ImageFolder(train_data_path, transform=transform) + + valid_indices = [] + # Iterate through the dataset and apply the transform to each image + for idx in range(len(dataset)): + try: + image, label = dataset[idx] + # If the transform works without errors, add the index to the list of valid indices + valid_indices.append(idx) + 
except Exception as e: + # A better way to do with would be with batch collation + logger.warning(f"Error occurred for image {idx}: {e}") + train_data = { - key: datasets.ImageFolder(train_data_path, transform=value) + key: torch.utils.data.Subset( + datasets.ImageFolder(train_data_path, transform=value), valid_indices + ) for key, value in transforms_dict.items() } + dataset = torch.utils.data.Subset( + datasets.ImageFolder(train_data_path, transform=transform), valid_indices + ) + for key, value in train_data.items(): print(key, len(value)) plt.imshow(train_data[key][0][0], cmap="gray") From dea41f65c46a2c0a49b2d7ce9f73484b6d17b301 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:50:39 +0000 Subject: [PATCH 050/204] Removing redundant code and adding logging --- scripts/shapes/shape_embed.py | 42 ++--------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index fc29eeb5..3dab09b3 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -280,21 +280,12 @@ def shape_embed_process(): ) for key, value in train_data.items(): - print(key, len(value)) - plt.imshow(train_data[key][0][0], cmap="gray") + logger.info(key, len(value)) + plt.imshow(np.array(train_data[key][0][0]), cmap="gray") plt.imsave(metadata(f"{key}.png"), train_data[key][0][0], cmap="gray") # plt.show() plt.close() - # plt.scatter(*train_data["transform_coords"][0][0]) - # plt.savefig(metadata(f"transform_coords.png")) - # plt.show() - - # plt.imshow(train_data["transform_crop"][0][0], cmap="gray") - # plt.scatter(*train_data["transform_coords"][0][0],c=np.arange(interp_size), cmap='rainbow', s=1) - # plt.show() - # plt.savefig(metadata(f"transform_coords.png")) - # Retrieve the coordinates and cropped image coords = train_data["transform_coords"][0][0] crop_image = train_data["transform_crop"][0][0] @@ -315,33 +306,8 @@ def shape_embed_process(): # Close the plot plt.close() - # import albumentations as A - # %% - gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) - transform = transforms.Compose( - [ - transform_mask_to_dist, - transforms.ToTensor(), - RotateIndexingClockwise(p=1), - gray2rgb, - ] - ) - - dataset = datasets.ImageFolder(train_data_path, transform=transform) - - valid_indices = [] - # Iterate through the dataset and apply the transform to each image - for idx in range(len(dataset)): - try: - image, label = dataset[idx] - # If the transform works without errors, add the index to the list of valid indices - valid_indices.append(idx) - except Exception as e: - # A better way to do with would be with batch collation - print(f"Error occurred for image {idx}: {e}") # Create a Subset using the valid indices - dataset = torch.utils.data.Subset(dataset, valid_indices) dataloader = DataModule( dataset, batch_size=args.batch_size, @@ -349,8 +315,6 @@ def shape_embed_process(): num_workers=args.num_workers, ) - # model = bioimage_embed.models.create_model("resnet18_vqvae_legacy", **vars(args)) - # model = bioimage_embed.models.create_model( model=args.model, input_dim=args.input_dim, @@ -358,8 +322,6 @@ def shape_embed_process(): pretrained=args.pretrained, ) - # model = bioimage_embed.models.factory.ModelFactory(**vars(args)).resnet50_vqvae_legacy() - # lit_model = shapes.MaskEmbedLatentAugment(model, args) lit_model = shapes.MaskEmbed(model, args) test_data = dataset[0][0].unsqueeze(0) From 85f1211860dc9567b902767ae10da57ecb1c1496 Mon Sep 17 00:00:00 2001 From: Craig 
Russell Date: Thu, 29 Feb 2024 16:51:09 +0000 Subject: [PATCH 051/204] [bug] Removing hard coded idx mapper --- scripts/shapes/shape_embed.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 3dab09b3..19bf4267 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -385,16 +385,14 @@ def shape_embed_process(): # torch.onnx.export(lit_model, example_input, f"{model_dir}/model.onnx") # %% - # Inference - + # Inference on full dataset dataloader = DataModule( dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, # Transform is commented here to avoid augmentations in real data - # HOWEVER, applying a the transform multiple times and averaging the results might produce better latent embeddings - # transform=transform, + # HOWEVER, applying the transform multiple times and averaging the results might produce better latent embeddings # transform=transform, ) dataloader.setup() @@ -408,16 +406,14 @@ def shape_embed_process(): y = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) df = pd.DataFrame(latent_space.numpy()) - df["Class"] = y - # Map numeric classes to their labels - idx_to_class = {0: "alive", 1: "dead"} - df["Class"] = df["Class"].map(idx_to_class).astype("category") + df["Class"] = pd.Series(y).map(idx_to_class).astype("category") df["Scale"] = scalings[:, 0].squeeze() df = df.set_index("Class") df_shape_embed = df.copy() # %% UMAP plot - umap_plot(df, metadata, width, height,split=0.9) + + umap_plot(df, metadata, width, height, split=0.9) X = df_shape_embed.to_numpy() y = df_shape_embed.index From e411a9da9577d5641763cf15f172ac8a6469764d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:51:44 +0000 Subject: [PATCH 052/204] Adding logging and and tqdm so it doesnt look like the code is hanging --- scripts/shapes/shape_embed.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 19bf4267..abb02879 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -427,11 +427,12 @@ def shape_embed_process(): "orientation", ] dfs = [] - for i, data in enumerate(train_data["transform_crop"]): + # Distance matrix data + for i, data in enumerate(tqdm(train_data["transform_crop"])): X, y = data # Do regionprops here # Calculate shape summary statistics using regionprops - # We're considering that the mask has only one object, thus we take the first element [0] + # We're considering that the mask has only one object, so we take the first element [0] # props = regionprops(np.array(X).astype(int))[0] props_table = measure.regionprops_table( np.array(X).astype(int), properties=properties @@ -447,9 +448,8 @@ def shape_embed_process(): df_regionprops = pd.concat(dfs) - # Assuming 'dataset_contour' is your DataLoader for the dataset dfs = [] - for i, data in enumerate(train_data["transform_coords"]): + for i, data in enumerate(tqdm(train_data["transform_coords"])): # Convert the tensor to a numpy array X, y = data @@ -498,7 +498,7 @@ def shape_embed_process(): y = trial["labels"] trial["score_df"] = scoring_df(X, y) trial["score_df"]["trial"] = trial["name"] - print(trial["score_df"]) + logger.info(trial["score_df"]) trial["score_df"].to_csv(metadata(f"{trial['name']}_score_df.csv")) trial_df = pd.concat([trial_df, trial["score_df"]]) trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) From 
66df0b991e4a5b03b3636017f49e0a2360c747ab Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:52:10 +0000 Subject: [PATCH 053/204] More logging --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index abb02879..87e8be5a 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -533,7 +533,7 @@ def shape_embed_process(): .groupby("trial") .mean() ) - print(avs) + logger.info(avs) # tikzplotlib.save(metadata(f"trials_barplot.tikz")) From b5db3670241277a732fa6ae982f542664d035fad Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:57:31 +0000 Subject: [PATCH 054/204] Adding a disk cleanup step --- .github/workflows/test.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index dbf6e63d..290f82f8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -21,6 +21,16 @@ jobs: # shell: bash -l {0} steps: - uses: actions/checkout@v2 + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true - uses: conda-incubator/setup-miniconda@v2 with: environment-file: environment.yml From f580f97b921f7cef00c4a73e4423f9eca0ef66fa Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 16:59:33 +0000 Subject: [PATCH 055/204] Reverting to other path structure --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 87e8be5a..61f34366 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -198,7 +198,7 @@ def shape_embed_process(): dataset_path = args.dataset - train_data_path = f"data/{dataset_path}" + train_data_path = f"scripts/shapes/data/{dataset_path}" metadata = lambda x: f"results/{dataset_path}_{args.model}/{x}" path = Path(metadata("")) From ae9f8d1907e1a57a6fc5fd5a3867e31f9621e32d Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 17:26:12 +0000 Subject: [PATCH 056/204] Adding average cross val to logs --- scripts/shapes/shape_embed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 61f34366..071127ed 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -507,6 +507,10 @@ def shape_embed_process(): trial_df.groupby("trial").mean().to_csv(metadata(f"trial_df_mean.csv")) trial_df.plot(kind="bar") + avg = trial_df.groupby("trial").mean() + logger.info(avg) + avg.to_latex(metadata(f"trial_df.tex")) + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") # fig, ax = plt.subplots(figsize=(width, height)) ax = sns.catplot( From dcdfa6389b34233c5aa9f744b76ab95ebb42f5f1 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Thu, 29 Feb 2024 17:26:23 +0000 Subject: [PATCH 057/204] Adding opencv to env file --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 568d68ce..cdd9cd54 100644 --- a/environment.yml +++ b/environment.yml @@ -14,5 +14,6 @@ dependencies: - pytorch - pillow=9.5.0 - pip +- conda-forge::opencv - pip: - -e . 
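For context before the next patches: the per-trial averaging introduced in PATCH 056 above collapses the cross-validation folds produced by scoring_df() into one row per feature set, and the same patch exports that table with to_latex(). Below is a minimal, self-contained sketch of that step (toy numbers only, not part of the patch series; the real trial_df has one row per fold and further metric columns such as test_precision and test_recall):

    # Illustrative sketch only -- not part of the patches.
    # Assumes a toy trial_df shaped like the one written by scoring_df():
    # one row per CV fold, metric columns, plus a "trial" column naming
    # the feature set being benchmarked.
    import pandas as pd

    trial_df = pd.DataFrame(
        {
            "trial": ["mask_embed", "mask_embed", "regionprops", "regionprops"],
            "test_f1": [0.91, 0.88, 0.76, 0.79],
            "test_accuracy": [0.92, 0.90, 0.80, 0.81],
        }
    )

    avg = trial_df.groupby("trial").mean()  # one row per feature set, averaged over folds
    print(avg)
    print(avg.to_latex())  # same export the patch writes to metadata("trial_df.tex")

The groupby/mean pair is also what lands in trial_df_mean.csv earlier in the script, so the LaTeX table and the CSV stay consistent by construction.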
From e8ca2cb52f540ad78637671a613822d41699840c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 1 Mar 2024 08:26:47 +0000 Subject: [PATCH 058/204] Added allen dataset --- slurm_shape_embed_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index a0b16047..4298dc18 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -41,6 +41,7 @@ , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") , ("synthcell", "synthcellshapes_dataset/") , ("helakyoto", "H2b_10x_MD_exp665/samples/") +, ("allen", "allen_dataset/") ] wandb_project='shape-embed-ite-dataset' From 775442c471ea477c9c7e0fcc9a322a338179449f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 1 Mar 2024 08:27:17 +0000 Subject: [PATCH 059/204] Limit time per job increased to 24h --- slurm_shape_embed_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 4298dc18..6adf0bad 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -95,7 +95,7 @@ def n_gpus(ls): print(result.stdout.decode('utf-8')) print(mem_size(ls)) result = subprocess.run([ 'sbatch' - , '--time', '10:00:00' + , '--time', '24:00:00' , '--mem', mem_size(ls) , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' From 2fce547fa7747e02f230d0db9b86f4138c885e8b Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Fri, 1 Mar 2024 17:30:05 +0000 Subject: [PATCH 060/204] Fixing case where multiple contours are found, chose the longest --- bioimage_embed/shapes/contours.py | 2 +- bioimage_embed/shapes/transforms.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/bioimage_embed/shapes/contours.py b/bioimage_embed/shapes/contours.py index fd82c4ba..6845b97f 100644 --- a/bioimage_embed/shapes/contours.py +++ b/bioimage_embed/shapes/contours.py @@ -35,7 +35,7 @@ def cubic_polar_resample_contour(contour: np.array, size: int) -> np.array: def contour_to_xy(contour: np.array): - return contour[0][:, 0], contour[0][:, 1] + return contour[:, 0], contour[:, 1] def uniform_spline_resample_contour(contour: np.array, size: int) -> np.array: diff --git a/bioimage_embed/shapes/transforms.py b/bioimage_embed/shapes/transforms.py index 1d350a04..2abd401f 100644 --- a/bioimage_embed/shapes/transforms.py +++ b/bioimage_embed/shapes/transforms.py @@ -173,6 +173,12 @@ def get_distogram(self, coords, matrix_normalised=False): return distance_matrix / np.linalg.norm([self.size, self.size]) +def find_longest_array(arrays): + lengths = [len(arr.flatten()) for arr in arrays] + max_length_index = np.argmax(lengths) + return arrays[max_length_index] + + class ImageToCoords(torch.nn.Module): def __init__(self, size): super().__init__() @@ -204,7 +210,8 @@ def get_coords_C( return torch.tensor(np.array(coords_list)) def get_coords(self, image, size, method="uniform_spline", contour_level=0.8): - contour = find_contours(np.array(image), contour_level) + contour_list = find_contours(np.array(image), contour_level) + contour = find_longest_array(contour_list) if method == "uniform_spline": return contours.uniform_spline_resample_contour(contour=contour, size=size) if method == "cubic_polar": From 6e14ffd8f858d884dfcf1a10eccaf5ca7e221938 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 14:21:23 +0000 Subject: [PATCH 061/204] change back to use dataset name from clarg + change default wandb jobname and latent space size --- scripts/shapes/shape_embed.py | 3 ++- 
slurm_shape_embed_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index ff3c8f27..d1ae640e 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -186,7 +186,8 @@ def shape_embed_process(clargs): "frobenius_norm": False, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" - "dataset": "synthcellshapes_dataset", + #"dataset": "synthcellshapes_dataset", + "dataset": clargs.dataset[0], } optimizer_params = { diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 6adf0bad..1eb570b1 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -34,7 +34,7 @@ , "resnet18_vae_legacy" ] batch_sizes = [4] -latent_space_sizes = [512] +latent_space_sizes = [128] datasets = [ ("vampire", "vampire/torchvision/Control/") @@ -44,7 +44,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-ite-dataset' +wandb_project='shape-embed-test-changes' slurm_script="""#!/bin/bash From f31b9a0a816c29af2f1b68f2835090fca30d1dd7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 16:45:58 +0000 Subject: [PATCH 062/204] added back dataset subseting --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index d1ae640e..f4a922b5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -353,6 +353,7 @@ def shape_embed_process(clargs): print(f"Error occurred for image {idx}: {e}") # Create a Subset using the valid indices + dataset = torch.utils.data.Subset(dataset, valid_indices) dataloader = DataModule( dataset, batch_size=args.batch_size, From d052800a74207da86840e06a2958b1e7d5c37b34 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 3 Mar 2024 16:47:10 +0000 Subject: [PATCH 063/204] Added a tiny dataset for quick debugging (commented out in the slurm script) --- slurm_shape_embed_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 1eb570b1..38a964d9 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -37,6 +37,7 @@ latent_space_sizes = [128] datasets = [ +# ("tiny_synthcell", "tiny_synthcellshapes_dataset/") ("vampire", "vampire/torchvision/Control/") , ("bbbc010", "bbbc010/BBBC010_v1_foreground_eachworm/") , ("synthcell", "synthcellshapes_dataset/") From df41415f1dca7014021859678a3c12878560c4ab Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 4 Mar 2024 20:35:37 +0000 Subject: [PATCH 064/204] use specific gpu resource --- scripts/shapes/shape_embed.py | 2 +- slurm_shape_embed_dataset.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index f4a922b5..bb2c65ff 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -183,7 +183,7 @@ def shape_embed_process(clargs): "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, - "frobenius_norm": False, + "frobenius_norm": True, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" #"dataset": "synthcellshapes_dataset", diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 38a964d9..95e7a90c 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -28,13 +28,17 @@ #sbatch "$job_script" models = [ - 
"resnet18_vae" + "resnet50_vae" +, "resnet50_vqvae" +, "resnet50_vqvae_legacy" +, "resnet50_vae_legacy" +, "resnet18_vae" , "resnet18_vqvae" , "resnet18_vqvae_legacy" -, "resnet18_vae_legacy" -] +, "resnet18_vae_legacy"] + batch_sizes = [4] -latent_space_sizes = [128] +latent_space_sizes = [512] datasets = [ # ("tiny_synthcell", "tiny_synthcellshapes_dataset/") @@ -45,7 +49,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-test-changes' +wandb_project='shape-embed-biggest' slurm_script="""#!/bin/bash @@ -101,6 +105,7 @@ def n_gpus(ls): , '--job-name', jobname , '--output', f'{slurmdir}/{jobname}.out' , '--error', f'{slurmdir}/{jobname}.err' - , '--gres', n_gpus(ls) + #, '--gres', n_gpus(ls) + , '--gpus=a100:1' , fp.name], stdout=subprocess.PIPE) print(result.stdout.decode('utf-8')) From f1e5a3cf95b308eb73c1396e9efda2c7b123c158 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:00 +0000 Subject: [PATCH 065/204] Adding roc_auc and using balanced accuracy --- scripts/shapes/shape_embed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 071127ed..36f130d3 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -113,10 +113,11 @@ def scoring_df(X, y): ) # Define a dictionary of metrics scoring = { - "accuracy": make_scorer(metrics.accuracy_score), + "accuracy": make_scorer(metrics.balanced_accuracy_score), "precision": make_scorer(metrics.precision_score, average="macro"), "recall": make_scorer(metrics.recall_score, average="macro"), "f1": make_scorer(metrics.f1_score, average="macro"), + "roc_auc": make_scorer(metrics.roc_auc_score, average="macro"), } # Create a random forest classifier From d31ca9600ca3b754ef91d541050857bbba84477f Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:31 +0000 Subject: [PATCH 066/204] Probably should stratify --- scripts/shapes/shape_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 36f130d3..e5a40242 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -4,7 +4,7 @@ from sklearn.decomposition import PCA from sklearn.discriminant_analysis import StandardScaler from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import cross_validate, KFold, train_test_split +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold from sklearn.metrics import make_scorer import pandas as pd from sklearn import metrics @@ -137,7 +137,7 @@ def scoring_df(X, y): estimator=pipeline, X=X, y=y, - cv=KFold(n_splits=k_folds), + cv=StratifiedKFold(n_splits=k_folds), scoring=scoring, n_jobs=-1, return_train_score=False, From abe2664aa11404801779be28e215a3cac9dcc72c Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Tue, 5 Mar 2024 09:35:47 +0000 Subject: [PATCH 067/204] Adding coordinate debug (unchecked) --- scripts/shapes/shape_embed.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index e5a40242..49cd29d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -400,6 +400,23 @@ def shape_embed_process(): predictions = trainer.predict(lit_model, datamodule=dataloader) + + test_dist_pred = predictions[0].out.recon_x + plt.imsave(metadata(f"test_dist_pred.png"), test_dist_pred.mean(axis=(0,1))) + plt.close() + + 
test_dist_in = predictions[0].x.data + plt.imsave(metadata(f"test_dist_in.png"), test_dist_in.mean(axis=(0,1))) + plt.close() + + test_pred_coords = AsymmetricDistogramToCoordsPipeline(window_size=window_size)( + np.array(test_dist_pred[:, 0, :, :].unsqueeze(dim=0)) + ) + + plt.scatter(*test_pred_coords[0,0].T) + # Save the plot as an image without border and coordinate axes + plt.savefig(metadata(f"test_pred_coords.png"), bbox_inches="tight", pad_inches=0) + plt.close() # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) From ab48c09436dce7c9c1e1a2647f18d8d79da5639b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 6 Mar 2024 09:24:36 +0000 Subject: [PATCH 068/204] put back frobenius norm false --- scripts/shapes/shape_embed.py | 2 +- slurm_shape_embed_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index bb2c65ff..f4a922b5 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -183,7 +183,7 @@ def shape_embed_process(clargs): "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, - "frobenius_norm": True, + "frobenius_norm": False, # dataset = "bbbc010/BBBC010_v1_foreground_eachworm" # dataset = "vampire/mefs/data/processed/Control" #"dataset": "synthcellshapes_dataset", diff --git a/slurm_shape_embed_dataset.py b/slurm_shape_embed_dataset.py index 95e7a90c..0651c361 100644 --- a/slurm_shape_embed_dataset.py +++ b/slurm_shape_embed_dataset.py @@ -49,7 +49,7 @@ , ("allen", "allen_dataset/") ] -wandb_project='shape-embed-biggest' +wandb_project='shape-embed-no-norm' slurm_script="""#!/bin/bash From 46eb3d1eb2a716ccd833bc6cdd6f5ea036466736 Mon Sep 17 00:00:00 2001 From: Craig Russell Date: Wed, 6 Mar 2024 12:08:00 +0000 Subject: [PATCH 069/204] Forgot an import --- scripts/shapes/shape_embed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 49cd29d6..375b1fc7 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -44,6 +44,7 @@ MaskToDistogramPipeline, RotateIndexingClockwise, CoordsToDistogram, + AsymmetricDistogramToCoordsPipeline, ) import matplotlib.pyplot as plt From db88d341ee12f6cc3a78d0750a8506e620a29b0f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 7 Mar 2024 09:28:36 +0000 Subject: [PATCH 070/204] add the hardcode entity and add model dir --- scripts/shapes/shape_embed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index ae5d5ce0..961f5ad0 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -383,6 +383,8 @@ def shape_embed_process(clargs): dataloader.setup() model.eval() + model_dir = f"checkpoints/{hashing_fn(args)}" + if clargs.clear_checkpoints: print("cleaning checkpoints") shutil.rmtree("checkpoints/") @@ -390,7 +392,8 @@ def shape_embed_process(clargs): tb_logger = pl_loggers.TensorBoardLogger(f"logs/") jobname = f"{params['model']}_{interp_size}_{params['batch_size']}_{clargs.dataset[0]}" - wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) + wandblogger = pl_loggers.WandbLogger(entity='foix', project="shape_embed_fixes", name=jobname) + #wandblogger = pl_loggers.WandbLogger(project=clargs.wandb_project, name=jobname) Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) From 
2505a2dfcba17cd38f9e02a85c6a0ab1a564d449 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 11 Mar 2024 14:56:31 +0000 Subject: [PATCH 071/204] reduce epochs --- scripts/shapes/shape_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index 4839daf7..eeb797d6 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -174,7 +174,7 @@ def shape_embed_process(clargs): params = { "model":clargs.model, #"model":"resnet18_vae", - "epochs": 250, + "epochs": 150, "batch_size": clargs.batch_size, #"batch_size": 4, "num_workers": 2**4, From f588a2d9a8cd983b3a031e4619bb286b5858bb18 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 27 Mar 2024 18:56:58 +0000 Subject: [PATCH 072/204] all changes --- bioimage_embed/shapes/mds.py | 5 +- scripts/shapes/distmatrix2embeding.py | 9 + scripts/shapes/masks2distmatrices.py | 249 ++++++++++++++++++++++++++ scripts/shapes/shape_embed.py | 2 +- 4 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 scripts/shapes/distmatrix2embeding.py create mode 100644 scripts/shapes/masks2distmatrices.py diff --git a/bioimage_embed/shapes/mds.py b/bioimage_embed/shapes/mds.py index fdcf2af1..19846375 100644 --- a/bioimage_embed/shapes/mds.py +++ b/bioimage_embed/shapes/mds.py @@ -7,11 +7,12 @@ def mds(d): :return: A matrix of x, y coordinates. """ n = d.size(0) - I = torch.eye(n) + I = torch.eye(n, dtype=torch.float64) H = I - torch.ones((n, n)) / n S = -0.5 * H @ d @ H - eigvals, eigvecs = S.symeig(eigenvectors=True) + #eigvals, eigvecs = S.symeig(eigenvectors=True) + eigvals, eigvecs = torch.linalg.eigh(S) # Sort the eigenvalues and eigenvectors in decreasing order idx = eigvals.argsort(descending=True) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py new file mode 100644 index 00000000..0f7fe82a --- /dev/null +++ b/scripts/shapes/distmatrix2embeding.py @@ -0,0 +1,9 @@ +# Loading the data (matrices) + +# TO DO: Apply transformation + +# Build the model + +# Train the model + +# Pull the embedings \ No newline at end of file diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py new file mode 100644 index 00000000..42051416 --- /dev/null +++ b/scripts/shapes/masks2distmatrices.py @@ -0,0 +1,249 @@ +# Imports when necessary +import numpy as np +import torch +import logging +import sklearn +import skimage as sk +import scipy.spatial +from scipy.interpolate import splprep, splev +import matplotlib.image +import matplotlib.pyplot as plt +import glob + +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.manifold import MDS + +from bioimage_embed import shapes +import bioimage_embed +from pytorch_lightning import loggers as pl_loggers +from torchvision import transforms +from bioimage_embed.lightning import DataModule + +from torchvision import datasets + +from bioimage_embed.shapes.mds import mds + +from bioimage_embed.shapes.transforms import ( + CropCentroidPipeline, + CoordsToDistogram, + ImageToCoords, + RotateIndexingClockwise, +) + +logger = logging.getLogger(__name__) + +# Where is the datat I want to transform +dataset = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" + +########################################################################## +####### Simplified version in order to make the things properly work ##### +########################################################################## + +def find_contour(mask): + contour = 
sk.measure.find_contours(mask, 0.8)[0] + x, y = contour[:, 0], contour[:, 1] + return x, y + +def spline_interpolation(x, y): + sparsity_contour = 4 # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother + tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) + sample_points = 200 + # How many times to sample the spline + new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. + # Evaluate the spline + x_spline, y_spline = splev(new_u, tck) + return x_spline, y_spline + +def build_distance_matrix(x_reinterpolated, y_reinterpolated): + reinterpolated_contour = np.column_stack([x_reinterpolated, y_reinterpolated]) + dm = scipy.spatial.distance_matrix(reinterpolated_contour, reinterpolated_contour) + return dm + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed') + return embedding.fit_transform(dst_mat) + + +# Simplified version for test +def process_png_file(mask_path): + # Perform specific action for each PNG file + print("Processing:", mask_path) + mask = plt.imread(mask_path) + + # Get the contour + x, y = find_contour(mask) + + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + plt.scatter(x_reinterpolated, y_reinterpolated, s=6) + plt.savefig(f'results/reconstruction/original_contour{i}.png') + plt.clf() + + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + # print("Distance matrix") + # print(dm) + + # Reconstruction coordinates and matrix (MDS) + reconstructed_coords = dist_to_coords(dm) + print(reconstructed_coords) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(f'results/reconstruction/reconstructed_contour{i}.png') + plt.clf() + reconstructed_matrix = euclidean_distances(reconstructed_coords) + + # Error with matrix + err = np.average(dm - reconstructed_matrix) + print(f"Dist error is: {err}") + +# Specify the folder path containing PNG files +folder_path = "/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/*/*.png" + +# Use glob to find all PNG files in the folder +png_files = glob.glob(folder_path) + +# Iterate through all PNG files found +for i, file_path in enumerate(png_files): + # Process the PNG file + process_png_file(file_path) + + + + +######################################## +############# Other code ############### +######################################## + +# # Needed variables +# window_size = 256 # needs to be the same as the latent space size +# interp_size = 256 # latent space size needs to match the window size + +# # This crops the image using the centroid by window sizes. 
(remember to removed and see what happens) +# transform_crop = CropCentroidPipeline(window_size) + +# # From the coordinates of the distance matrix, this is actually building the distance matrix +# transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) + +# # It takes the images and converts it into a numpy array of the image and the size +# transform_coords = ImageToCoords(window_size) + +# # Combination of transforms +# transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) + +# transform_mask_to_crop = transforms.Compose( +# [ +# # transforms.ToTensor(), +# transform_mask_to_gray, +# transform_crop, +# ] +# ) + +# transform_mask_to_coords = transforms.Compose( +# [ +# transform_mask_to_crop, +# transform_coords, +# ] +# ) + +# transform_mask_to_dist = transforms.Compose( +# [ +# transform_mask_to_coords, +# transform_coord_to_dist, +# ] +# ) + +# def dist_to_coords(dst_mat): +# embedding = MDS(n_components=2, dissimilarity='precomputed', max_iter=1) +# return embedding.fit_transform(dst_mat) + + #coords_prime = MDS( + #n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dst_mat) + + #return coords_prime + #return mds(dst_mat) + + # from https://math.stackexchange.com/a/423898 and https://stackoverflow.com/a/17177833/16632916 +# m = np.zeros(shape=dst_mat.shape) +# for i in range(dst_mat.shape[0]): +# for j in range(dst_mat.shape[1]): +# m[i,j]= 0.5*(dst_mat[0, j]**2 + dst_mat[i, 0]**2 - dst_mat[i, j]**2) +# eigenvalues, eigenvectors = np.linalg.eig(m) +# print(f'm:{m}') +# print(f'eigenvalues:{eigenvalues}') +# print(f'eigenvectors:{eigenvectors}') +# return np.sqrt(eigenvalues)*eigenvectors + +# # Convert your image to gray scale +# gray2rgb = transforms.Lambda(lambda x: x.repeat(3, 1, 1)) + +# # choose the transformation you want to apply to your data and Compose +# transform = transforms.Compose( +# [ +# transform_mask_to_dist, +# transforms.ToTensor(), +# RotateIndexingClockwise(p=1), # This module effectively allows for random clockwise rotations of input images with a specified probability. 
+# gray2rgb, +# ] +# ) + +# transforms_dict = { +# "none": transform_mask_to_gray, +# "transform_crop": transform_mask_to_crop, +# "transform_dist": transform_mask_to_dist, +# "transform_coords": transform_mask_to_coords, +# } + + + +# diagonal = np.diag(dm) + +# if np.all(diagonal == 0): +# print("All elements in the diagonal are zeros.") +# dataset_raw[i][0].save(f'original_{i}.png') +# np.save(f"random_matrix_{i}.npy", dataset_trans[i][0][0]) +# matplotlib.image.imsave(f'dist_mat_{i}.png', dataset_trans[i][0][0]) +# coords = dist_to_coords(dataset_trans[i][0][0]) +# print(coords) +# x, y = list(zip(*coords)) +# plt.scatter(x_reinterpolated, y_reinterpolated) +# plt.savefig(f'mask_{i}.png') +# plt.clf() +# fig, ax = plt.subplots(1, 4, figsize=(20, 5)) +# ax[0].imshow(mask) +# ax[1].scatter(x_reinterpolated, y_reinterpolated) +# ax[1].imshow(dm) +# ax[3].scatter(x, y) +# fig.savefig(f'combined_{i}.png') +# else: +# print("Not all elements in the diagonal are zeros.") + + + +# # Apply transform to find which images don't work +# dataset_raw = datasets.ImageFolder(dataset) +# dataset_contours = datasets.ImageFolder(dataset, transform=transform_mask_to_coords) +# dataset_trans = datasets.ImageFolder(dataset, transform=transform) + +# # This is a single image distance matrix +# for i in range(0, 10): +# print(dataset_trans[i][0][0]) +# diagonal = np.diag(dataset_trans[i][0][0]) +# if np.all(diagonal == 0): +# print("All elements in the diagonal are zeros.") +# dataset_raw[i][0].save(f'original_{i}.png') +# np.save(f"random_matrix_{i}.npy", dataset_trans[i][0][0]) +# matplotlib.image.imsave(f'dist_mat_{i}.png', dataset_trans[i][0][0]) +# coords = dist_to_coords(dataset_trans[i][0][0]) +# print(coords) +# x, y = list(zip(*coords)) +# plt.scatter(x, y) +# plt.savefig(f'mask_{i}.png') +# plt.clf() +# fig, ax = plt.subplots(1, 4, figsize=(20, 5)) +# ax[0].imshow(dataset_raw[i][0]) +# ax[1].imshow(dataset_trans[i][0][0]) +# ax[2].scatter(dataset_contours[i][0][0], dataset_contours[i][0][1]) +# ax[3].scatter(x, y) +# fig.savefig(f'combined_{i}.png') +# else: +# print("Not all elements in the diagonal are zeros.") diff --git a/scripts/shapes/shape_embed.py b/scripts/shapes/shape_embed.py index eeb797d6..2a82c708 100644 --- a/scripts/shapes/shape_embed.py +++ b/scripts/shapes/shape_embed.py @@ -224,7 +224,7 @@ def shape_embed_process(clargs): # window_size, interp_size, matrix_normalised=False # ) transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) - transform_mdscoords = DistogramToCoords(window_size) + #transform_mdscoords = DistogramToCoords(window_size) transform_coords = ImageToCoords(window_size) transform_mask_to_gray = transforms.Compose([transforms.Grayscale(1)]) From 6151a6f3a08f3dca3dd29127dad86344064d2a0a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 27 Mar 2024 22:24:07 +0000 Subject: [PATCH 073/204] first structure --- scripts/shapes/distmatrix2embeding.py | 184 +++++++++++++++++++++++++- scripts/shapes/masks2distmatrices.py | 88 +++++++----- 2 files changed, 237 insertions(+), 35 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 0f7fe82a..1dbeb7c3 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,9 +1,183 @@ -# Loading the data (matrices) +from torchvision import datasets, transforms +import pytorch_lightning as pl +import bioimage_embed +import bioimage_embed.shapes +import bioimage_embed.lightning +import argparse +import types -# TO DO: 
Apply transformation +# misc helpers +############################################################################### -# Build the model +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) -# Train the model +# Main process +############################################################################### -# Pull the embedings \ No newline at end of file +def main_process(params): + + # Loading the data (matrices) + ########################################################################### + + preproc_transform = transforms.Compose([ + transforms.ToTensor(), + ]) + dataset = datasets.ImageFolder(params.dataset[1], transform = preproc_transform) + dataloader = bioimage_embed.lightning.DataModule( + dataset, + batch_size=params.batch_size, + shuffle=True, + num_workers=params.num_workers, + ) + dataloader.setup() + vprint(1, f'dataloader ready') + + # Build the model + ########################################################################### + + model = bioimage_embed.models.create_model( + model=params.model, + input_dim=params.input_dim, + latent_dim=params.latent_dim, + pretrained=params.pretrained, + ) + lit_model = bioimage_embed.shapes.MaskEmbed(model, params) + vprint(1, f'model ready') + + # Train the model + ########################################################################### + + trainer = pl.Trainer( + #TODO logger=[wandblogger, tb_logger], + gradient_clip_val=0.5, + enable_checkpointing=True, + devices=1, + #TODO accelerator="gpu", + accumulate_grad_batches=4, + #TODO callbacks=[checkpoint_callback], + min_epochs=50, + max_epochs=params.epochs, + log_every_n_steps=1, + ) + trainer.fit(lit_model, datamodule=dataloader) + lit_model.eval() + vprint(1, f'trainer fitted') + + # Pull the embedings + ########################################################################### + vprint(1, f'TODO') + +# default parameters +############################################################################### + +params = types.SimpleNamespace(**{ + # general params + "model":"resnet18_vae", + "epochs": 150, + "batch_size": 4, + "num_workers": 2**4, + "input_dim": (3, 512, 512), + "latent_dim": 512, + "num_embeddings": 512, + "num_hiddens": 512, + "pretrained": True, + "commitment_cost": 0.25, + "decay": 0.99, + "frobenius_norm": False, + "dataset": "bbbc010/BBBC010_v1_foreground_eachworm", + # optimizer_params + "opt": "AdamW", + "lr": 0.001, + "weight_decay": 0.0001, + "momentum": 0.9, + # lr_scheduler_params + "sched": "cosine", + "min_lr": 1e-4, + "warmup_epochs": 5, + "warmup_lr": 1e-6, + "cooldown_epochs": 10, + "t_max": 50, + "cycle_momentum": False, +}) + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + models = [ + "resnet18_vae" + , "resnet50_vae" + , "resnet18_vae_bolt" + , "resnet50_vae_bolt" + , "resnet18_vqvae" + , "resnet50_vqvae" + , "resnet18_vqvae_legacy" + , "resnet50_vqvae_legacy" + , "resnet101_vqvae_legacy" + , "resnet110_vqvae_legacy" + , "resnet152_vqvae_legacy" + , "resnet18_vae_legacy" + , "resnet50_vae_legacy" + ] + parser.add_argument( + '-m', '--model', choices=models, default=models[0], metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {models[0]}).") + parser.add_argument( + '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset") + parser.add_argument( + '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' + , help=f"The wandb PROJECT name") + parser.add_argument( + '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int + , help="The BATCH_SIZE for the run, a positive integer (default 4)") + parser.add_argument( + '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + parser.add_argument( + '-n', '--num-workers', default=int(2**4), metavar='NUM_WORKERS', type=auto_pos_int + , help="The NUM_WORKERS for the run, a positive integer (default 2**4)") + parser.add_argument( + '-e', '--num-epochs', default=int(150), metavar='NUM_EPOCHS', type=auto_pos_int + , help="The NUM_EPOCHS for the run, a positive integer (default 150)") + #parser.add_argument('--clear-checkpoints', action='store_true' + # , help='remove checkpoints') + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.model = clargs.model + params.dataset = clargs.dataset + params.wandb_project = clargs.wandb_project + params.batch_size = clargs.batch_size + interp_size = clargs.latent_space_size * 2 + params.input_dim = (3, interp_size, interp_size) + params.latent_dim = interp_size + params.num_embeddings = interp_size + params.num_hiddens = interp_size + params.num_workers = clargs.num_workers + params.epochs = clargs.num_epochs + + # run main process + main_process(params) \ No newline at end of file diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 42051416..7d31bb0c 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -1,27 +1,16 @@ # Imports when necessary import numpy as np -import torch -import logging -import sklearn import skimage as sk import scipy.spatial from scipy.interpolate import splprep, splev -import matplotlib.image import matplotlib.pyplot as plt import glob from sklearn.metrics.pairwise import euclidean_distances from sklearn.manifold import MDS -from bioimage_embed import shapes -import bioimage_embed -from pytorch_lightning import loggers as pl_loggers -from torchvision import transforms -from bioimage_embed.lightning import DataModule -from torchvision import datasets - -from bioimage_embed.shapes.mds import mds +from torchvision import datasets, transforms from bioimage_embed.shapes.transforms import ( CropCentroidPipeline, @@ -30,24 +19,39 @@ 
RotateIndexingClockwise, ) -logger = logging.getLogger(__name__) - # Where is the datat I want to transform -dataset = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" +#folder_path = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" +folder_path = f"/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset/" ########################################################################## ####### Simplified version in order to make the things properly work ##### ########################################################################## +def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): + """Turn an rgb array into a greyscale array using the following reduction: + grey = cr * r + cg * g + cb * b + + :param rgb: The rgb array + :param cr: The red coefficient + :param cg: The green coefficient + :param cb: The blue coefficient + + :returns: The greyscale array. + """ + r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] + return cr * r + cg * g + cb * b + def find_contour(mask): + if len(mask.shape) == 3: # (lines, columns, number of channels) + mask = rgb2grey(mask) contour = sk.measure.find_contours(mask, 0.8)[0] x, y = contour[:, 0], contour[:, 1] return x, y -def spline_interpolation(x, y): - sparsity_contour = 4 # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother +def spline_interpolation(x, y, sparsity_contour = 4, sample_points = 200): + # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother + sparsity_contour = max(1, sparsity_contour) tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) - sample_points = 200 # How many times to sample the spline new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. 
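    # splprep above fits a parametric spline through the (optionally
    # subsampled) raw contour points and new_u picks sample_points evenly
    # spaced parameter values, so every mask is resampled to the same number
    # of contour points before its distance matrix is built. A minimal sketch
    # of the idea, assuming a closed circular contour (purely illustrative,
    # not part of this script):
    #   t = np.linspace(0, 2 * np.pi, 100, endpoint=False)
    #   xs, ys = spline_interpolation(np.cos(t), np.sin(t), 4, 200)
    #   assert len(xs) == len(ys) == 200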
# Evaluate the spline @@ -63,9 +67,28 @@ def dist_to_coords(dst_mat): embedding = MDS(n_components=2, dissimilarity='precomputed') return embedding.fit_transform(dst_mat) +def mask2distmatrix(mask): + # extract mask contour + x, y = find_contour(mask) + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + return dm + +def masks2distmatrices(mask_dataset_path=folder_path, output_path=None): + print('loading base dataset') + dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ + np.array, + mask2distmatrix + ])) + for idx, data in enumerate(dataset): + print(f'idx: {idx}') + print(f'data: {data}') + #torch.save(data, 'data_drive_path{}'.format(idx)) # Simplified version for test -def process_png_file(mask_path): +def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): # Perform specific action for each PNG file print("Processing:", mask_path) mask = plt.imread(mask_path) @@ -76,11 +99,12 @@ def process_png_file(mask_path): # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) plt.scatter(x_reinterpolated, y_reinterpolated, s=6) - plt.savefig(f'results/reconstruction/original_contour{i}.png') + plt.savefig(f'{output_folder}/original_contour{idx}.png') plt.clf() # Build the distance matrix dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + np.save(f"{output_folder}/matrix_{idx}.npy", dm) # print("Distance matrix") # print(dm) @@ -88,28 +112,32 @@ def process_png_file(mask_path): reconstructed_coords = dist_to_coords(dm) print(reconstructed_coords) plt.scatter(*zip(*reconstructed_coords), s=6) - plt.savefig(f'results/reconstruction/reconstructed_contour{i}.png') + plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') plt.clf() reconstructed_matrix = euclidean_distances(reconstructed_coords) # Error with matrix err = np.average(dm - reconstructed_matrix) print(f"Dist error is: {err}") +############################################################################### -# Specify the folder path containing PNG files -folder_path = "/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/*/*.png" - -# Use glob to find all PNG files in the folder -png_files = glob.glob(folder_path) +if __name__ == "__main__": -# Iterate through all PNG files found -for i, file_path in enumerate(png_files): - # Process the PNG file - process_png_file(file_path) + ## Use glob to find all PNG files in the folder + #png_files = glob.glob(folder_path+"*/*.png") + # + ## Iterate through all PNG files found + #for i, file_path in enumerate(png_files): + # # Process the PNG file + # process_png_file(file_path, i) + masks2distmatrices() +############################################################################### +############################################################################### +############################################################################### ######################################## ############# Other code ############### ######################################## From 8208238dcc6b078a29b2cf91d23d774c007e0ed4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:13:05 +0100 Subject: [PATCH 074/204] Properly overwrite default params from clargs --- scripts/shapes/distmatrix2embeding.py | 88 +++++++++++++++------------ 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py 
b/scripts/shapes/distmatrix2embeding.py index 1dbeb7c3..9c7845df 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -77,9 +77,25 @@ def main_process(params): # default parameters ############################################################################### +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] + params = types.SimpleNamespace(**{ # general params - "model":"resnet18_vae", + "model": "resnet18_vae", "epochs": 150, "batch_size": 4, "num_workers": 2**4, @@ -91,7 +107,7 @@ def main_process(params): "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, - "dataset": "bbbc010/BBBC010_v1_foreground_eachworm", + "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset_distmat"), # optimizer_params "opt": "AdamW", "lr": 0.001, @@ -119,42 +135,27 @@ def auto_pos_int (x): parser = argparse.ArgumentParser(description='Run the shape embed pipeline') - models = [ - "resnet18_vae" - , "resnet50_vae" - , "resnet18_vae_bolt" - , "resnet50_vae_bolt" - , "resnet18_vqvae" - , "resnet50_vqvae" - , "resnet18_vqvae_legacy" - , "resnet50_vqvae_legacy" - , "resnet101_vqvae_legacy" - , "resnet110_vqvae_legacy" - , "resnet152_vqvae_legacy" - , "resnet18_vae_legacy" - , "resnet50_vae_legacy" - ] parser.add_argument( - '-m', '--model', choices=models, default=models[0], metavar='MODEL' - , help=f"The MODEL to use, one of {models} (default {models[0]}).") + '-m', '--model', choices=models, metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {params.model}).") parser.add_argument( - '-d', '--dataset', nargs=2, default=("vampire", "vampire/torchvision/Control/"), metavar=('NAME', 'PATH') - , help=f"The NAME of and PATH to the dataset") + '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') + , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") parser.add_argument( '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' , help=f"The wandb PROJECT name") parser.add_argument( - '-b', '--batch-size', default=int(4), metavar='BATCH_SIZE', type=auto_pos_int - , help="The BATCH_SIZE for the run, a positive integer (default 4)") + '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int + , help=f"The BATCH_SIZE for the run, a positive integer (default {params.batch_size})") parser.add_argument( - '-l', '--latent-space-size', default=int(128), metavar='LATENT_SPACE_SIZE', type=auto_pos_int - , help="The LATENT_SPACE_SIZE, a positive integer (default 128)") + '-l', '--latent-space-size', metavar='LATENT_SPACE_SIZE', type=auto_pos_int + , help=f"The LATENT_SPACE_SIZE, a positive integer (default {params.latent_dim})") parser.add_argument( - '-n', '--num-workers', default=int(2**4), metavar='NUM_WORKERS', type=auto_pos_int - , help="The NUM_WORKERS for the run, a positive integer (default 2**4)") + '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int + , help=f"The NUM_WORKERS for the run, a positive integer (default {params.num_workers})") parser.add_argument( - '-e', '--num-epochs', default=int(150), metavar='NUM_EPOCHS', type=auto_pos_int - , help="The NUM_EPOCHS for the run, a positive integer (default 150)") + '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int + , help=f"The NUM_EPOCHS 
for the run, a positive integer (default {params.epochs})") #parser.add_argument('--clear-checkpoints', action='store_true' # , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 @@ -167,17 +168,24 @@ def auto_pos_int (x): vprint.lvl = clargs.verbose # update default params with clargs - params.model = clargs.model - params.dataset = clargs.dataset - params.wandb_project = clargs.wandb_project - params.batch_size = clargs.batch_size - interp_size = clargs.latent_space_size * 2 - params.input_dim = (3, interp_size, interp_size) - params.latent_dim = interp_size - params.num_embeddings = interp_size - params.num_hiddens = interp_size - params.num_workers = clargs.num_workers - params.epochs = clargs.num_epochs + if clargs.model: + params.model = clargs.model + if clargs.dataset: + params.dataset = clargs.dataset + if clargs.wandb_project: + params.wandb_project = clargs.wandb_project + if clargs.batch_size: + params.batch_size = clargs.batch_size + if clargs.latent_space_size: + interp_size = clargs.latent_space_size * 2 + params.input_dim = (params.input_dim[0], interp_size, interp_size) + params.latent_dim = interp_size + params.num_embeddings = interp_size + params.num_hiddens = interp_size + if clargs.num_workers: + params.num_workers = clargs.num_workers + if clargs.num_epochs: + params.epochs = clargs.num_epochs # run main process main_process(params) \ No newline at end of file From aacac25deaf527653f2f7406d5eb8665fa39e9f3 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:16:43 +0100 Subject: [PATCH 075/204] Use DatasetFolder to load .npy and turn the dist matrix into a 3 channels copy for models to be happy --- scripts/shapes/distmatrix2embeding.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9c7845df..a1026b67 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,9 +1,11 @@ from torchvision import datasets, transforms import pytorch_lightning as pl +import numpy as np import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning import argparse +import torch import types # misc helpers @@ -27,9 +29,10 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - transforms.ToTensor(), + torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor + lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) - dataset = datasets.ImageFolder(params.dataset[1], transform = preproc_transform) + dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( dataset, batch_size=params.batch_size, From 579ab0bf7eb94ee85891d16db7070e13a4daea36 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:19:07 +0100 Subject: [PATCH 076/204] Disable checkpoints in training by default (maybe re-enable at some future point) --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index a1026b67..c1168798 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -60,7 +60,7 @@ def main_process(params): trainer = pl.Trainer( #TODO logger=[wandblogger, 
tb_logger], gradient_clip_val=0.5, - enable_checkpointing=True, + enable_checkpointing=False, devices=1, #TODO accelerator="gpu", accumulate_grad_batches=4, From f0fee887c6a4bb93e341c673f4e27022c1585954 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:19:56 +0100 Subject: [PATCH 077/204] Enable gpu accelleration by default --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c1168798..144b5102 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -62,7 +62,7 @@ def main_process(params): gradient_clip_val=0.5, enable_checkpointing=False, devices=1, - #TODO accelerator="gpu", + accelerator="gpu", accumulate_grad_batches=4, #TODO callbacks=[checkpoint_callback], min_epochs=50, From 5e97713d757c18b1869ec3eaea1beae6a32dce40 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 12:20:24 +0100 Subject: [PATCH 078/204] more informative verbose print --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 144b5102..79166af5 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -75,7 +75,7 @@ def main_process(params): # Pull the embedings ########################################################################### - vprint(1, f'TODO') + vprint(1, f'TODO: pull the embedings') # default parameters ############################################################################### From bb425dfcce248d138089f0ced7614dd00be17190 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 1 Apr 2024 13:41:54 +0100 Subject: [PATCH 079/204] bring argparse to the masks2distmatrices script --- scripts/shapes/masks2distmatrices.py | 236 +++++++++++++++++---------- 1 file changed, 151 insertions(+), 85 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 7d31bb0c..023617ae 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -1,31 +1,25 @@ -# Imports when necessary import numpy as np +import imageio.v3 as iio import skimage as sk -import scipy.spatial from scipy.interpolate import splprep, splev -import matplotlib.pyplot as plt +import scipy.spatial +import argparse +import pathlib +import types import glob +import os -from sklearn.metrics.pairwise import euclidean_distances -from sklearn.manifold import MDS - - -from torchvision import datasets, transforms - -from bioimage_embed.shapes.transforms import ( - CropCentroidPipeline, - CoordsToDistogram, - ImageToCoords, - RotateIndexingClockwise, -) - -# Where is the datat I want to transform -#folder_path = f"/nfs/research/uhlmann/afoix/bbbc010/BBBC010_v1_foreground_eachworm/" -folder_path = f"/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset/" +# misc helpers +############################################################################### -########################################################################## -####### Simplified version in order to make the things properly work ##### -########################################################################## +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) def 
rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): """Turn an rgb array into a greyscale array using the following reduction: @@ -41,6 +35,10 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] return cr * r + cg * g + cb * b +########################################################################## +####### Simplified version in order to make the things properly work ##### +########################################################################## + def find_contour(mask): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) @@ -48,12 +46,12 @@ def find_contour(mask): x, y = contour[:, 0], contour[:, 1] return x, y -def spline_interpolation(x, y, sparsity_contour = 4, sample_points = 200): +def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): # Sparsity of the contour. Dropping some of the sample (points) to make the spline smoother - sparsity_contour = max(1, sparsity_contour) - tck, u = splprep([x[::sparsity_contour], y[::sparsity_contour]], s = 0) + raw_sampling_sparsity = max(1, raw_sampling_sparsity) + tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0) # How many times to sample the spline - new_u = np.linspace(u.min(), u.max(), sample_points) # Last parameter is how dense is our spline, how many points. + new_u = np.linspace(u.min(), u.max(), spline_sampling) # Last parameter is how dense is our spline, how many points. # Evaluate the spline x_spline, y_spline = splev(new_u, tck) return x_spline, y_spline @@ -66,72 +64,140 @@ def build_distance_matrix(x_reinterpolated, y_reinterpolated): def dist_to_coords(dst_mat): embedding = MDS(n_components=2, dissimilarity='precomputed') return embedding.fit_transform(dst_mat) - -def mask2distmatrix(mask): + +def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): + vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour x, y = find_contour(mask) # Reinterpolate (spline) - x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) # Build the distance matrix dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + vprint(3, f'created distance matrix shape {dm.shape}') return dm -def masks2distmatrices(mask_dataset_path=folder_path, output_path=None): - print('loading base dataset') - dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ - np.array, - mask2distmatrix - ])) - for idx, data in enumerate(dataset): - print(f'idx: {idx}') - print(f'data: {data}') - #torch.save(data, 'data_drive_path{}'.format(idx)) - -# Simplified version for test -def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): - # Perform specific action for each PNG file - print("Processing:", mask_path) - mask = plt.imread(mask_path) - - # Get the contour - x, y = find_contour(mask) - - # Reinterpolate (spline) - x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) - plt.scatter(x_reinterpolated, y_reinterpolated, s=6) - plt.savefig(f'{output_folder}/original_contour{idx}.png') - plt.clf() - - # Build the distance matrix - dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) - np.save(f"{output_folder}/matrix_{idx}.npy", dm) - # print("Distance matrix") - # print(dm) - - # Reconstruction coordinates and matrix (MDS) - reconstructed_coords = 
dist_to_coords(dm) - print(reconstructed_coords) - plt.scatter(*zip(*reconstructed_coords), s=6) - plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') - plt.clf() - reconstructed_matrix = euclidean_distances(reconstructed_coords) - - # Error with matrix - err = np.average(dm - reconstructed_matrix) - print(f"Dist error is: {err}") +def masks2distmatrices(params): + + vprint(1, 'loading base dataset') + + if not params.mask_dataset_path: + sys.exit("no mask dataset provided") + if not params.output_path: + p = pathlib.Path(params.mask_dataset_path) + params.output_path=p.joinpath(p.parent, p.name+'_distmat') + + vprint(2, f'>>>> params.mask_dataset_path: {params.mask_dataset_path}') + vprint(2, f'>>>> params.mask_dataset_path: {next(os.walk(params.mask_dataset_path))[1]}') + vprint(2, f'>>>> params.output_path: {params.output_path}') + pathlib.Path(params.output_path).mkdir(parents=True, exist_ok=True) + class_folders = next(os.walk(params.mask_dataset_path))[1] + vprint(2, f'>>>> class_folders: {class_folders}') + for class_folder in class_folders: + vprint(2, f'>>>> class_folder: {class_folder}') + output_class_folder=os.path.join(params.output_path, class_folder) + vprint(2, f'creating output class folder: {output_class_folder}') + pathlib.Path(output_class_folder).mkdir(parents=True, exist_ok=True) + for mask_png in glob.glob(params.mask_dataset_path+'/'+class_folder+'/'+'*.png'): + vprint(3, f'{"-"*80}') + vprint(3, f'working on {mask_png}') + filename = os.path.basename(mask_png).split('.')[0] + vprint(3, f'filename {filename}') + mask = iio.imread(mask_png) + dm = mask2distmatrix(mask, params.raw_sampling_sparsity, params.spline_sampling) + output_file_name=f"{output_class_folder}/{filename}.npy" + vprint(3, f'saving {output_file_name}') + vprint(3, f'{"-"*80}') + np.save(output_file_name, dm) + + + #print('loading base dataset') + #dataset = datasets.ImageFolder(mask_dataset_path, transform=transforms.Compose([ + # np.array, + # mask2distmatrix + #])) + #for idx, data in enumerate(dataset): + # print(f'idx: {idx}') + # print(f'data: {data}') + # #torch.save(data, 'data_drive_path{}'.format(idx)) + #print(dataset) + +# # Simplified version for test +# def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): +# # Perform specific action for each PNG file +# print("Processing:", mask_path) +# mask = plt.imread(mask_path) + +# # Get the contour +# x, y = find_contour(mask) + +# # Reinterpolate (spline) +# x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) +# plt.scatter(x_reinterpolated, y_reinterpolated, s=6) +# plt.savefig(f'{output_folder}/original_contour{idx}.png') +# plt.clf() + +# # Build the distance matrix +# dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) +# np.save(f"{output_folder}/matrix_{idx}.npy", dm) + +# # Reconstruction coordinates and matrix (MDS) +# reconstructed_coords = dist_to_coords(dm) +# print(reconstructed_coords) +# plt.scatter(*zip(*reconstructed_coords), s=6) +# plt.savefig(f'{output_folder}/reconstructed_contour{idx}.png') +# plt.clf() +# reconstructed_matrix = euclidean_distances(reconstructed_coords) + +# # Error with matrix +# err = np.average(dm - reconstructed_matrix) +# print(f"Dist error is: {err}") + ############################################################################### +params = types.SimpleNamespace(**{ + "mask_dataset_path": None + , "output_path": None + , "raw_sampling_sparsity": 1 + , "spline_sampling": 512 +}) + if __name__ == "__main__": - ## Use glob to find all PNG 
files in the folder - #png_files = glob.glob(folder_path+"*/*.png") - # - ## Iterate through all PNG files found - #for i, file_path in enumerate(png_files): - # # Process the PNG file - # process_png_file(file_path, i) + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn mask dataset into distance matrix dataset') + + parser.add_argument('path', metavar='PATH', help=f"The PATH to the dataset") + parser.add_argument('-o', '--output-path', help="The desired output path to the generated dataset") + parser.add_argument('-s', '--raw-sampling-sparsity', type=auto_pos_int + , help=f"The desired sparsity (in number of points) when sampling the raw contour (default, every {params.raw_sampling_sparsity} point(s))") + parser.add_argument('-n', '--spline-sampling', type=auto_pos_int + , help=f"The desired number of points when sampling the spline contour (default, {params.spline_sampling} point(s))") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") - masks2distmatrices() + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + if clargs.path: + params.mask_dataset_path = clargs.path + #params.mask_dataset_path = "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset" + if clargs.output_path: + params.output_path = clargs.output_path + if clargs.raw_sampling_sparsity: + params.raw_sampling_sparsity = clargs.raw_sampling_sparsity + if clargs.spline_sampling: + params.spline_sampling = clargs.spline_sampling + + masks2distmatrices(params) @@ -142,12 +208,12 @@ def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): ############# Other code ############### ######################################## -# # Needed variables +# # Needed variables # window_size = 256 # needs to be the same as the latent space size # interp_size = 256 # latent space size needs to match the window size # # This crops the image using the centroid by window sizes. 
(remember to removed and see what happens) -# transform_crop = CropCentroidPipeline(window_size) +# transform_crop = CropCentroidPipeline(window_size) # # From the coordinates of the distance matrix, this is actually building the distance matrix # transform_coord_to_dist = CoordsToDistogram(interp_size, matrix_normalised=False) @@ -183,13 +249,13 @@ def process_png_file(mask_path, idx, output_folder='./results/reconstruction'): # def dist_to_coords(dst_mat): # embedding = MDS(n_components=2, dissimilarity='precomputed', max_iter=1) # return embedding.fit_transform(dst_mat) - + #coords_prime = MDS( #n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dst_mat) - + #return coords_prime #return mds(dst_mat) - + # from https://math.stackexchange.com/a/423898 and https://stackoverflow.com/a/17177833/16632916 # m = np.zeros(shape=dst_mat.shape) # for i in range(dst_mat.shape[0]): From ac3b85a79875b59073f91731c177837c98d6d32c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 11:35:06 +0100 Subject: [PATCH 080/204] training and test model --- scripts/shapes/distmatrix2embeding.py | 33 +++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 79166af5..277e7269 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -4,6 +4,7 @@ import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning +from bioimage_embed.lightning import DataModule import argparse import torch import types @@ -72,10 +73,38 @@ def main_process(params): trainer.fit(lit_model, datamodule=dataloader) lit_model.eval() vprint(1, f'trainer fitted') - - # Pull the embedings + + #TODO: Validate the model + ########################################################################### + vprint(1, f'TODO: Validate the model') + validation = trainer.validate(lit_model, datamodule=dataloader) + + #TODO: Test the model + ########################################################################### + vprint(1, f'TODO: Test the model') + testing = trainer.test(lit_model, datamodule=dataloader) + + # Inference on full dataset + dataloader = DataModule( + dataset, + batch_size=1, + shuffle=False, + num_workers=params.num_workers, + # Transform is commented here to avoid augmentations in real data + # HOWEVER, applying the transform multiple times and averaging the results might produce better latent embeddings + # transform=transform, + ) + dataloader.setup() + + predictions = trainer.predict(lit_model, datamodule=dataloader) + + #TODO: Pull the embedings ########################################################################### vprint(1, f'TODO: pull the embedings') + # Use the namespace variables + latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + # Save the latent space + np.save('latent_space.npy', latent_space) # default parameters ############################################################################### From 7e7c7a28cd6d99d21cd6d00c722193028a8c4653 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 21:43:04 +0100 Subject: [PATCH 081/204] Roll indices + normalisation + sanity_check + dataset name for latent space --- scripts/shapes/distmatrix2embeding.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 277e7269..ad26555d 100644 --- a/scripts/shapes/distmatrix2embeding.py 
+++ b/scripts/shapes/distmatrix2embeding.py @@ -6,6 +6,7 @@ import bioimage_embed.lightning from bioimage_embed.lightning import DataModule import argparse +import datetime import torch import types @@ -21,6 +22,21 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): vprint.lvl = 0 vprint(tgtlvl, msg) +def maybe_roll (dist_mat, p = 0.5): + if np.random.rand() < p: + return np.roll(dist_mat, np.random.randint(0, dist_mat.shape[0]), (0,1)) + else: + return dist_mat + +def sanity_check (dist_mat): + if not np.allclose(dist_mat, dist_mat.T): + raise ValueError("Matrix is not symmetric") + if np.any(dist_mat < 0): + raise ValueError("Matrix has negative values") + if np.any(np.diag(dist_mat)): + raise ValueError("Matrix has non-zero diagonal") + return dist_mat + # Main process ############################################################################### @@ -30,10 +46,14 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ + lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix + sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) + #dataset = datasets.DatasetFolder(params.dataset[1], loader=lambda x: np.load(x, allow_pickle=True), extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( dataset, batch_size=params.batch_size, @@ -104,7 +124,7 @@ def main_process(params): # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) # Save the latent space - np.save('latent_space.npy', latent_space) + np.save(f'{params.dataset[0]}_{str(datetime.datetime.now()).replace(" ", "_")}.npy', latent_space) # default parameters ############################################################################### From da4acf89c30cba05dc54e29373ac02adcd7e4b9e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 2 Apr 2024 22:00:25 +0100 Subject: [PATCH 082/204] Added wandb logging --- scripts/shapes/distmatrix2embeding.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ad26555d..c7adae0f 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -5,6 +5,7 @@ import bioimage_embed.shapes import bioimage_embed.lightning from bioimage_embed.lightning import DataModule +from pytorch_lightning import loggers as pl_loggers import argparse import datetime import torch @@ -75,11 +76,17 @@ def main_process(params): lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') + # WandB logger + ########################################################################### + jobname = f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger.watch(lit_model, log="all") + # Train the model ########################################################################### trainer = pl.Trainer( - #TODO logger=[wandblogger, tb_logger], + 
logger=[wandblogger], gradient_clip_val=0.5, enable_checkpointing=False, devices=1, @@ -194,8 +201,11 @@ def auto_pos_int (x): '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") parser.add_argument( - '-w', '--wandb-project', default="shape-embed", metavar='PROJECT' - , help=f"The wandb PROJECT name") + '--wandb-entity', default="foix", metavar='WANDB_ENTITY' + , help=f"The WANDB_ENTITY name") + parser.add_argument( + '--wandb-project', default="simply-shape", metavar='WANDB_PROJECT' + , help=f"The WANDB_PROJECT name") parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {params.batch_size})") @@ -224,6 +234,8 @@ def auto_pos_int (x): params.model = clargs.model if clargs.dataset: params.dataset = clargs.dataset + if clargs.wandb_entity: + params.wandb_entity = clargs.wandb_entity if clargs.wandb_project: params.wandb_project = clargs.wandb_project if clargs.batch_size: From fd7d1225e2f27c42e2657909d736badcaef3bccf Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 3 Apr 2024 00:37:34 +0100 Subject: [PATCH 083/204] Added the extraction of original/reconstructed matrices + clarg for output dir --- scripts/shapes/distmatrix2embeding.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c7adae0f..9f3616e5 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -8,6 +8,7 @@ from pytorch_lightning import loggers as pl_loggers import argparse import datetime +import pathlib import torch import types @@ -125,13 +126,22 @@ def main_process(params): predictions = trainer.predict(lit_model, datamodule=dataloader) - #TODO: Pull the embedings + #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### - vprint(1, f'TODO: pull the embedings') + vprint(1, f'pull the embedings') # Use the namespace variables latent_space = torch.stack([d.out.z.flatten() for d in predictions]) + # create the output directory + output_dir = params.output_dir + if output_dir is None: + output_dir = f'./{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) # Save the latent space - np.save(f'{params.dataset[0]}_{str(datetime.datetime.now()).replace(" ", "_")}.npy', latent_space) + np.save(f'{output_dir}/latent_space.npy', latent_space) + # Save the reconstructions + for i, pred in enumerate(predictions): + np.save(f'{output_dir}/original_{i}.npy', pred.x.data[0,0]) + np.save(f'{output_dir}/reconstruction_{i}.npy', pred.out.recon_x[0,0]) # default parameters ############################################################################### @@ -200,6 +210,9 @@ def auto_pos_int (x): parser.add_argument( '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=None + , help=f"The OUTPUT_DIR path to use to dump results") parser.add_argument( '--wandb-entity', default="foix", metavar='WANDB_ENTITY' , help=f"The WANDB_ENTITY name") @@ -232,6 +245,7 @@ def auto_pos_int (x): # update default params with clargs if clargs.model: params.model = 
clargs.model + params.output_dir = clargs.output_dir if clargs.dataset: params.dataset = clargs.dataset if clargs.wandb_entity: From ee7cd7f2be432a8ad4c52693a3da978e962cae6f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 3 Apr 2024 00:38:29 +0100 Subject: [PATCH 084/204] created a script that renders dist matrices .npy as .png images --- scripts/shapes/distmatrices2contour.py | 73 ++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 scripts/shapes/distmatrices2contour.py diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py new file mode 100644 index 00000000..70e15a1d --- /dev/null +++ b/scripts/shapes/distmatrices2contour.py @@ -0,0 +1,73 @@ +import matplotlib.pyplot as plt +from sklearn.manifold import MDS +import numpy as np +import argparse +import pathlib +import types +import glob + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + return dm + +def asym_to_sym(asym_dist_mat): + return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed', normalized_stress='auto') + return embedding.fit_transform(dst_mat) + +def distmatrices2contour(params): + plt.clf() + dm_npys = glob.glob(f'{params.matrices_folder}/orig*.npy') + glob.glob(f'{params.matrices_folder}/recon*.npy') + for dm_npy in dm_npys: + dm = np.load(dm_npy) + vprint(2, f'{dm_npy}: dm.shape={dm.shape}') + dm = asym_to_sym(dm) + p = pathlib.Path(dm_npy) + p = p.with_suffix('.png') + reconstructed_coords = dist_to_coords(dm) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(p) + vprint(2, f'saved {p}') + plt.clf() + +############################################################################### + +params = types.SimpleNamespace(**{ + "matrices_folder": None +}) + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn distance matrices into contours') + + parser.add_argument('matrices_folder', metavar='MATRICES_FOLDER', help=f"The path to the matrices folder") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.matrices_folder = clargs.matrices_folder + + distmatrices2contour(params) From 20db342bf3aa2a0044fbb00f7e608b93bf384f06 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 08:38:59 +0100 Subject: [PATCH 085/204] new changes: sparisity, periodicity and also add a script to draw contours from dm --- scripts/shapes/drawContourFromDM.py | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 scripts/shapes/drawContourFromDM.py diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py new file mode 100644 index 00000000..39863aee --- /dev/null +++ b/scripts/shapes/drawContourFromDM.py @@ -0,0 +1,74 @@ + +import matplotlib.pyplot as plt +from sklearn.manifold import MDS +import numpy as np +import argparse +import pathlib +import types +import glob + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + #return dm + +def asym_to_sym(asym_dist_mat): + return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) + +def dist_to_coords(dst_mat): + embedding = MDS(n_components=2, dissimilarity='precomputed', normalized_stress='auto') + return embedding.fit_transform(dst_mat) + +def distmatrices2contour(params): + plt.clf() + dm_npys = glob.glob(f'{params.matrices_folder}/*.npy') + for dm_npy in dm_npys: + dm = np.load(dm_npy) + vprint(2, f'{dm_npy}: dm.shape={dm.shape}') + dm = asym_to_sym(dm) + p = pathlib.Path(dm_npy) + p = p.with_suffix('.png') + reconstructed_coords = dist_to_coords(dm) + plt.scatter(*zip(*reconstructed_coords), s=6) + plt.savefig(p) + vprint(2, f'saved {p}') + plt.clf() + +############################################################################### + +params = types.SimpleNamespace(**{ + "matrices_folder": None +}) + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Turn distance matrices into contours') + + parser.add_argument('matrices_folder', metavar='MATRICES_FOLDER', help=f"The path to the matrices folder") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + # update default params with clargs + params.matrices_folder = clargs.matrices_folder + + distmatrices2contour(params) From 784479870ee85ae04441253d6f2edd67894e0dc1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:37:39 +0100 Subject: [PATCH 086/204] masks2distmat: turn find_contour into find_longest_contour --- scripts/shapes/masks2distmatrices.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 023617ae..6b41755c 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -39,11 +39,13 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): ####### Simplified version in order to make the things properly work ##### ########################################################################## -def find_contour(mask): +def find_longest_contour(mask): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) - contour = sk.measure.find_contours(mask, 0.8)[0] - x, y = contour[:, 0], contour[:, 1] + contours = sk.measure.find_contours(mask, 0.8) + vprint(4, f'len(contours) {len(contours)}') + contours = sorted(contours, key=lambda x: len(x), reverse=True) + x, y = contours[0][:, 0], contours[0][:, 1] return x, y def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): @@ -68,7 +70,8 @@ def dist_to_coords(dst_mat): def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour - x, y = find_contour(mask) + x, y = find_longest_contour(mask) + vprint(3, f'found contour shape x {x.shape} y {y.shape}') # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) # Build the distance matrix @@ -128,7 +131,7 @@ def masks2distmatrices(params): # mask = plt.imread(mask_path) # # Get the contour -# x, y = find_contour(mask) +# x, y = find_longest_contour(mask) # # Reinterpolate (spline) # x_reinterpolated, y_reinterpolated = spline_interpolation(x, y) From 177df9ef0f7cb407a39a67147b7dd4bf3cc5adda Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:38:35 +0100 Subject: [PATCH 087/204] masks2distmat: enable periodic splprep for closed contours --- scripts/shapes/masks2distmatrices.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 6b41755c..b48c429d 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -51,7 +51,9 @@ def find_longest_contour(mask): def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): # Sparsity of the contour. 
Dropping some of the sample (points) to make the spline smoother raw_sampling_sparsity = max(1, raw_sampling_sparsity) - tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0) + vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') + vprint(3, f'x.shape {x.shape} y.shape {y.shape}') + tck, u = splprep([x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]], s = 0, per = True) # How many times to sample the spline new_u = np.linspace(u.min(), u.max(), spline_sampling) # Last parameter is how dense is our spline, how many points. # Evaluate the spline From 4bd148756c0b50b39080c8ee8990dc9b249afd98 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:39:26 +0100 Subject: [PATCH 088/204] masks2distmat: updated default sparsity to 4 --- scripts/shapes/masks2distmatrices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index b48c429d..06e99f7b 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -162,7 +162,7 @@ def masks2distmatrices(params): params = types.SimpleNamespace(**{ "mask_dataset_path": None , "output_path": None - , "raw_sampling_sparsity": 1 + , "raw_sampling_sparsity": 4 , "spline_sampling": 512 }) From 6d951e4da3f568a691dc098138792a034d015fa0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:44:06 +0100 Subject: [PATCH 089/204] distmat2contour: removed spurious return statement in vprint --- scripts/shapes/distmatrices2contour.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py index 70e15a1d..754dd5fa 100644 --- a/scripts/shapes/distmatrices2contour.py +++ b/scripts/shapes/distmatrices2contour.py @@ -17,7 +17,6 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): print("verbosity level not set, defaulting to 0") vprint.lvl = 0 vprint(tgtlvl, msg) - return dm def asym_to_sym(asym_dist_mat): return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) From 1481b51e8b09bae2eb2d36fa4b3ab5769deccba3 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 19:47:35 +0100 Subject: [PATCH 090/204] drawContourFromDM: removed spurious return statement in vprint --- scripts/shapes/drawContourFromDM.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py index 39863aee..671a1b96 100644 --- a/scripts/shapes/drawContourFromDM.py +++ b/scripts/shapes/drawContourFromDM.py @@ -18,7 +18,6 @@ def vprint(tgtlvl, msg, pfx = f"{'':<5}"): print("verbosity level not set, defaulting to 0") vprint.lvl = 0 vprint(tgtlvl, msg) - #return dm def asym_to_sym(asym_dist_mat): return np.max(np.stack([asym_dist_mat, asym_dist_mat.T]), axis=0) From 01c499ae5def089ac2333e7303a55f8fca1fb54e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 20:11:53 +0100 Subject: [PATCH 091/204] set correct aspect ratio for distmat2contour scripts --- scripts/shapes/distmatrices2contour.py | 1 + scripts/shapes/drawContourFromDM.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/shapes/distmatrices2contour.py b/scripts/shapes/distmatrices2contour.py index 754dd5fa..23b56bb8 100644 --- a/scripts/shapes/distmatrices2contour.py +++ b/scripts/shapes/distmatrices2contour.py @@ -35,6 +35,7 @@ def distmatrices2contour(params): p = pathlib.Path(dm_npy) p = p.with_suffix('.png') reconstructed_coords = dist_to_coords(dm) 
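        # dist_to_coords recovers 2-D points from the symmetrised distance
        # matrix via metric MDS (dissimilarity='precomputed'), so the contour
        # is only reconstructed up to an arbitrary rotation/reflection; the
        # equal aspect ratio set just below keeps that shape undistorted in
        # the saved plot. A rough, optional check of the residual error
        # (sketch only: euclidean_distances is not imported in this script
        # and would come from sklearn.metrics.pairwise):
        #   err = np.abs(dm - euclidean_distances(reconstructed_coords)).mean()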
+ plt.axes().set_aspect('equal') plt.scatter(*zip(*reconstructed_coords), s=6) plt.savefig(p) vprint(2, f'saved {p}') diff --git a/scripts/shapes/drawContourFromDM.py b/scripts/shapes/drawContourFromDM.py index 671a1b96..fde5172f 100644 --- a/scripts/shapes/drawContourFromDM.py +++ b/scripts/shapes/drawContourFromDM.py @@ -36,6 +36,7 @@ def distmatrices2contour(params): p = pathlib.Path(dm_npy) p = p.with_suffix('.png') reconstructed_coords = dist_to_coords(dm) + plt.axes().set_aspect('equal') plt.scatter(*zip(*reconstructed_coords), s=6) plt.savefig(p) vprint(2, f'saved {p}') From 879c5f4534018b08416bbb60bd06449a05978daa Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:01:50 +0100 Subject: [PATCH 092/204] add different normalisations in dataset initial transformations --- scripts/shapes/distmatrix2embeding.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9f3616e5..e97d694c 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -48,7 +48,10 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x*1000, # scale the matrix + lambda x: x / x.max(), # normalize each element to one using the max value (0-1) + lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor From fc58ad329d1e921d9301b7d8e27122c8967211a9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:02:52 +0100 Subject: [PATCH 093/204] Add notion of class label to ditmat2emb script output --- scripts/shapes/distmatrix2embeding.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index e97d694c..a699eeb6 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -127,7 +127,10 @@ def main_process(params): ) dataloader.setup() + # Predict + ########################################################################### predictions = trainer.predict(lit_model, datamodule=dataloader) + class_indices = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### @@ -142,9 +145,12 @@ def main_process(params): # Save the latent space np.save(f'{output_dir}/latent_space.npy', latent_space) # Save the reconstructions - for i, pred in enumerate(predictions): - np.save(f'{output_dir}/original_{i}.npy', pred.x.data[0,0]) - np.save(f'{output_dir}/reconstruction_{i}.npy', pred.out.recon_x[0,0]) + for class_label in dataset.classes: + pathlib.Path(f'{output_dir}/{class_label}').mkdir(parents=True, exist_ok=True) + for i, (pred, class_idx) in enumerate(zip(predictions, class_indices)): + class_label = dataset.classes[class_idx] + np.save(f'{output_dir}/{class_label}/original_{i}_{class_label}.npy', pred.x.data[0,0]) + np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) # default parameters 
############################################################################### From 2a4d9559ee461c10a144d6260b843da4e6a82253 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 9 Apr 2024 22:59:49 +0100 Subject: [PATCH 094/204] Updated default model path in distmat2emb script --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index a699eeb6..432b5075 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -185,7 +185,7 @@ def main_process(params): "commitment_cost": 0.25, "decay": 0.99, "frobenius_norm": False, - "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/tiny_synthcellshapes_dataset_distmat"), + "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/distmat_datasets/tiny_synthcellshapes_dataset_distmat"), # optimizer_params "opt": "AdamW", "lr": 0.001, From 0c166ecfa128ac13a67bb9ad91499a9de76f66a5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 15 Apr 2024 22:16:25 +0100 Subject: [PATCH 095/204] Added umap and kmeans + original filenames list --- scripts/shapes/distmatrix2embeding.py | 57 +++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 432b5075..ee234a14 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,6 +1,11 @@ from torchvision import datasets, transforms import pytorch_lightning as pl +import pandas as pd import numpy as np +import umap +import umap.plot +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans import bioimage_embed import bioimage_embed.shapes import bioimage_embed.lightning @@ -130,27 +135,65 @@ def main_process(params): # Predict ########################################################################### predictions = trainer.predict(lit_model, datamodule=dataloader) + filenames = [sample[0] for sample in dataloader.get_dataset().samples] class_indices = np.array([int(data[-1]) for data in dataloader.predict_dataloader()]) #TODO: Pull the embedings and reconstructed distance matrices ########################################################################### - vprint(1, f'pull the embedings') - # Use the namespace variables - latent_space = torch.stack([d.out.z.flatten() for d in predictions]) # create the output directory output_dir = params.output_dir if output_dir is None: output_dir = f'./{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) - # Save the latent space - np.save(f'{output_dir}/latent_space.npy', latent_space) - # Save the reconstructions for class_label in dataset.classes: pathlib.Path(f'{output_dir}/{class_label}').mkdir(parents=True, exist_ok=True) - for i, (pred, class_idx) in enumerate(zip(predictions, class_indices)): + # Save the latent space + vprint(1, f'pull the embedings') + latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + np.save(f'{output_dir}/latent_space.npy', latent_space) + # Save the (original input and) reconstructions + for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): + vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') class_label = dataset.classes[class_idx] np.save(f'{output_dir}/{class_label}/original_{i}_{class_label}.npy', 
pred.x.data[0,0]) np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) + # umap + vprint(4, f'generate umap') + umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + mapper = umap_model.fit(latent_space) + umap.plot.points(mapper, labels=np.array([dataset.classes[x] for x in class_indices])) + plt.savefig(f'{output_dir}/umap.png') + + # kmean and clustering information + # Perform KMeans clustering on the UMAP result + vprint(4, f'cluster data with kmean') + n_clusters = 4 # Define the number of clusters + kmeans = KMeans(n_clusters=n_clusters, random_state=42) + umap_result = umap_model.fit_transform(latent_space) + cluster_labels = kmeans.fit_predict(umap_result) + + # Concatenate the original data, UMAP result, and cluster labels + data_with_clusters = np.column_stack((latent_space, umap_result, cluster_labels)) + + # Convert to DataFrame for better handling + columns = [f'Feature_{i}' for i in range(latent_space.shape[1])] + \ + ['UMAP_Dimension_1', 'UMAP_Dimension_2', 'Cluster_Label'] + df = pd.DataFrame(data_with_clusters, columns=columns) + df['fname'] = filenames + + df.to_csv(f'{output_dir}/clustered_data.csv', index=False) + + # Plot the UMAP result with cluster labels + plt.figure(figsize=(10, 8)) + for i in range(n_clusters): + plt.scatter(umap_result[cluster_labels == i, 0], umap_result[cluster_labels == i, 1], label=f'Cluster {i+1}', s=5) + plt.title('UMAP Visualization of Latent Space with KMeans Clustering') + plt.xlabel('UMAP Dimension 1') + plt.ylabel('UMAP Dimension 2') + plt.legend() + + # Save the figure + plt.savefig(f'{output_dir}/umap_with_kmeans_clusters.png') # default parameters ############################################################################### From 4826ba3554b69f11324fbc537c3349e68495887b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:11:18 +0100 Subject: [PATCH 096/204] dist2emb: random seed for np and pl --- scripts/shapes/distmatrix2embeding.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ee234a14..0b9bc12d 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -17,6 +17,10 @@ import torch import types +# Seed everything +np.random.seed(42) +pl.seed_everything(42) + # misc helpers ############################################################################### From 3e562472c49a1620210b75f07cd7ae93dfa2e41b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:12:07 +0100 Subject: [PATCH 097/204] dist2emb: test different initial transformations --- scripts/shapes/distmatrix2embeding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 0b9bc12d..e1221f70 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -57,15 +57,17 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ + lambda x: x / 256, # scale the matrix to the number of pixels #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix - #lambda x: x*1000, # scale the matrix - lambda x: x / x.max(), # normalize each element to one using the max value (0-1) - lambda x: x*255, # scale the matrix to 255 + lambda x: x*100, # scale the matrix + #lambda x: x / x.max(), # normalize each element to one using 
the max value (0-1) + #lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) ]) + dataset = datasets.DatasetFolder(params.dataset[1], loader=np.load, extensions=('npy'), transform = preproc_transform) #dataset = datasets.DatasetFolder(params.dataset[1], loader=lambda x: np.load(x, allow_pickle=True), extensions=('npy'), transform = preproc_transform) dataloader = bioimage_embed.lightning.DataModule( From a29c57d9abf3acf641ad9186c4f693bbb3b59136 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:13:09 +0100 Subject: [PATCH 098/204] dist2emb: remove "TODO" from prints --- scripts/shapes/distmatrix2embeding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index e1221f70..f815cb74 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -118,12 +118,12 @@ def main_process(params): #TODO: Validate the model ########################################################################### - vprint(1, f'TODO: Validate the model') + vprint(1, f'Validate the model') validation = trainer.validate(lit_model, datamodule=dataloader) #TODO: Test the model ########################################################################### - vprint(1, f'TODO: Test the model') + vprint(1, f'Test the model') testing = trainer.test(lit_model, datamodule=dataloader) # Inference on full dataset From bf5ce7f8bdd6f5acbe826b916e4dc5d336542f0f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:19:39 +0100 Subject: [PATCH 099/204] cosmetics --- bioimage_embed/lightning/torch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index 53d649fe..e559215d 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -9,7 +9,6 @@ from pythae.models.base.base_utils import ModelOutput import torch.nn.functional as F - class LitAutoEncoderTorch(pl.LightningModule): args = argparse.Namespace( opt="adamw", From 41890e564d0e02c8202d1dee14d5e0a2d91e911c Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:20:15 +0100 Subject: [PATCH 100/204] LitAutoEncoderTorch: return both loss and recon_loss --- bioimage_embed/lightning/torch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index e559215d..ab730c3f 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -93,7 +93,11 @@ def training_step(self, batch, batch_idx): return loss def loss_function(self, model_output, *args, **kwargs): - return model_output.loss + #return model_output.loss + return { + "loss": model_output.loss, + "recon_loss": model_output.recon_loss, + } # def logging_step(self, z, loss, x, model_output, batch_idx): # self.logger.experiment.add_embedding( From beb571483fa1e21947c0a900ff59a8618cfd61d0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:21:56 +0100 Subject: [PATCH 101/204] MaskEmbed: turn off normalisation in DistanceMatrixLoss --- bioimage_embed/shapes/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index e5ec529e..2664695a 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -35,7 +35,7 @@ def batch_to_tensor(self, batch): return ModelOutput(data=normalised_data / scalings, scalings=scalings) def loss_function(self, model_output, *args, **kwargs): - loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=True) + loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=False) loss = model_output.loss loss += torch.sum( torch.stack( From da092b5c0e697511b88092a552b5c23ef12ee2c1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 17 Apr 2024 21:27:53 +0100 Subject: [PATCH 102/204] MaskEmbed: log losses in loss_function method --- bioimage_embed/shapes/lightning.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 2664695a..941007a9 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -37,22 +37,38 @@ def batch_to_tensor(self, batch): def loss_function(self, model_output, *args, **kwargs): loss_ops = lf.DistanceMatrixLoss(model_output.recon_x, norm=False) loss = model_output.loss - loss += torch.sum( + shape_loss = torch.sum( torch.stack( [ loss_ops.diagonal_loss(), loss_ops.symmetry_loss(), - # loss_ops.triangle_inequality(), loss_ops.non_negative_loss(), + # loss_ops.triangle_inequality(), # loss_ops.clockwise_order_loss(), ] ) ) + loss += shape_loss # loss += lf.diagonal_loss(model_output.recon_x) # loss += lf.symmetry_loss(model_output.recon_x) # loss += lf.triangle_inequality_loss(model_output.recon_x) # loss += lf.non_negative_loss(model_output.recon_x) + #return loss + + #variational_loss = model_output.loss - model_output.recon_loss + + self.log_dict( + { + "loss": loss, + "shape_loss": shape_loss, + "reconstruction_loss": model_output.recon_loss, + "variational_loss": model_output.vq_loss, + }, + on_epoch=True, + prog_bar=True, + logger=True, + ) return loss From 78d067d46916a024a384260e2293767a8900f1d4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:00:38 +0100 Subject: [PATCH 103/204] renamed varitional_loss to vq_loss + comment out kdl_vae_loss --- bioimage_embed/shapes/lightning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 941007a9..02ef0e4e 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -63,7 +63,8 @@ def loss_function(self, model_output, *args, **kwargs): "loss": loss, "shape_loss": shape_loss, "reconstruction_loss": model_output.recon_loss, - "variational_loss": model_output.vq_loss, + "vq_loss": model_output.vq_loss, + #"kdl_vae_loss": model_output.KLD }, on_epoch=True, prog_bar=True, From d0c4ffbcf6de0205afd628cabe8fd4ddaaa19316 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:01:44 +0100 Subject: [PATCH 104/204] removed new line --- bioimage_embed/shapes/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 02ef0e4e..e8dfee80 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -144,4 +144,4 @@ def training_step(self, batch, batch_idx, optimizer_idx=0): def configure_optimizers(self): opt_ed, lr_s_ed = self.timm_optimizers(self.model) - return 
self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) + return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) \ No newline at end of file From 253b3f4b7aeb05ff8f5d73b2c6bceb85efb16dc7 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:03:28 +0100 Subject: [PATCH 105/204] Normalise contour coord in mask2distmat script --- scripts/shapes/masks2distmatrices.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/masks2distmatrices.py b/scripts/shapes/masks2distmatrices.py index 06e99f7b..c6af9ae8 100644 --- a/scripts/shapes/masks2distmatrices.py +++ b/scripts/shapes/masks2distmatrices.py @@ -39,13 +39,18 @@ def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): ####### Simplified version in order to make the things properly work ##### ########################################################################## -def find_longest_contour(mask): +def find_longest_contour(mask, normalise_coord=False): if len(mask.shape) == 3: # (lines, columns, number of channels) mask = rgb2grey(mask) contours = sk.measure.find_contours(mask, 0.8) vprint(4, f'len(contours) {len(contours)}') contours = sorted(contours, key=lambda x: len(x), reverse=True) x, y = contours[0][:, 0], contours[0][:, 1] + if normalise_coord: + x = x - np.min(x) + x = x / np.max(x) + y = y - np.min(y) + y = y / np.max(y) return x, y def spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling): @@ -72,7 +77,7 @@ def dist_to_coords(dst_mat): def mask2distmatrix(mask, raw_sampling_sparsity=1, spline_sampling=512): vprint(3, f'running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') # extract mask contour - x, y = find_longest_contour(mask) + x, y = find_longest_contour(mask, normalise_coord=True) vprint(3, f'found contour shape x {x.shape} y {y.shape}') # Reinterpolate (spline) x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, raw_sampling_sparsity, spline_sampling) From 5066e3db864c1be535c3dcced3240654454fa61f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 24 Apr 2024 21:05:07 +0100 Subject: [PATCH 106/204] Use bokeh for interactive umap plot (save as html file) --- scripts/shapes/distmatrix2embeding.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index f815cb74..b4d1762e 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -4,6 +4,7 @@ import numpy as np import umap import umap.plot +import bokeh.plotting import matplotlib.pyplot as plt from sklearn.cluster import KMeans import bioimage_embed @@ -157,6 +158,11 @@ def main_process(params): vprint(1, f'pull the embedings') latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() np.save(f'{output_dir}/latent_space.npy', latent_space) + df = pd.DataFrame(latent_space) + df['class_idx'] = class_indices + df['class'] = [dataset.classes[x] for x in class_indices] + df['fname'] = filenames + #df.to_pickle(f'{output_dir}/latent_space.pkl') # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -166,9 +172,13 @@ def main_process(params): # umap vprint(4, f'generate umap') umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) - mapper = umap_model.fit(latent_space) - umap.plot.points(mapper, 
labels=np.array([dataset.classes[x] for x in class_indices])) + mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) + umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/umap.png') + p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + # save interactive plot as html + bokeh.plotting.output_file(f"{output_dir}/umap.html") + bokeh.plotting.save(p) # kmean and clustering information # Perform KMeans clustering on the UMAP result From 977620ecb13a0beca07f61bb61b7157b4d5fd236 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Apr 2024 22:59:20 +0100 Subject: [PATCH 107/204] save latent_space with extra info as pickle again and have a separate gen UMAPs script --- scripts/shapes/distmatrix2embeding.py | 7 +- scripts/shapes/genUMAPs.py | 133 ++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 3 deletions(-) create mode 100755 scripts/shapes/genUMAPs.py diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index b4d1762e..07b0062a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -162,7 +162,7 @@ def main_process(params): df['class_idx'] = class_indices df['class'] = [dataset.classes[x] for x in class_indices] df['fname'] = filenames - #df.to_pickle(f'{output_dir}/latent_space.pkl') + df.to_pickle(f'{output_dir}/latent_space.pkl') # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -171,11 +171,12 @@ def main_process(params): np.save(f'{output_dir}/{class_label}/reconstruction_{i}_{class_label}.npy', pred.out.recon_x[0,0]) # umap vprint(4, f'generate umap') - umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42) + umap_model = umap.UMAP(n_neighbors=50, min_dist=0.8, n_components=2, random_state=42) mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/umap.png') - p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) # save interactive plot as html bokeh.plotting.output_file(f"{output_dir}/umap.html") bokeh.plotting.save(p) diff --git a/scripts/shapes/genUMAPs.py b/scripts/shapes/genUMAPs.py new file mode 100755 index 00000000..09121647 --- /dev/null +++ b/scripts/shapes/genUMAPs.py @@ -0,0 +1,133 @@ +#! 
/usr/bin/env python3 + +import os +import os.path +import pandas as pd +import numpy as np +import umap +import umap.plot +import matplotlib.pyplot as plt +import bokeh.plotting +import argparse +import datetime +import pathlib +import multiprocessing +import subprocess + +# Seed everything +np.random.seed(42) + +# misc helpers +############################################################################### + +def vprint(tgtlvl, msg, pfx = f"{'':<5}"): + try: + if (tgtlvl <= vprint.lvl): + print(f"{pfx}{msg}") + except AttributeError: + print("verbosity level not set, defaulting to 0") + vprint.lvl = 0 + vprint(tgtlvl, msg) + +# render UMAPS +def render_umap_core(df, output_dir, n_neighbors, min_dist, n_components): + name = f'umap_{n_neighbors}_{min_dist}_{n_components}' + vprint(4, f'generate {name}') + vprint(5, f'n_neigbhors: {type(n_neighbors)} {n_neighbors}') + vprint(5, f'min_dist: {type(min_dist)} {min_dist}') + vprint(5, f'n_components: {type(n_components)} {n_components}') + umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=42) + mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) + umap.plot.points(mapper, labels=np.array(df['class'])) + plt.savefig(f'{output_dir}/{name}.png') + #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + # save interactive plot as html + bokeh.plotting.output_file(f"{output_dir}/{name}.html") + bokeh.plotting.save(p) + +def render_umap(latent_space_pkl, output_dir, n_neighbors, min_dist, n_components): + # create output directory if it does not already exist + os.makedirs(output_dir, exist_ok=True) + # load latent space + df = pd.read_pickle(latent_space_pkl) + # render umap + render_umap_core(df, output_dir, n_neighbors, min_dist, n_components) + +############################################################################### + +if __name__ == "__main__": + + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='generate umaps') + + parser.add_argument('latent_space', metavar='LATENT_SPACE', type=os.path.abspath + , help=f"The path to the latent space") + parser.add_argument('-j', '--n_jobs', type=auto_pos_int, default=2*os.cpu_count() + , help="number of jobs to start. Default is 2x the number of CPUs.") + parser.add_argument('--slurm', action=argparse.BooleanOptionalAction) + parser.add_argument('-n', '--n_neighbors', nargs='+', type=auto_pos_int, default=[50] + , help="A list of the number of neighbors to use in UMAP. Default is [50].") + parser.add_argument('-m', '--min_dist', nargs='+', type=float, default=[0.8] + , help="A list of the minimum distances to use in UMAP. Default is [0.8].") + parser.add_argument('-c', '--n_components', nargs='+', type=auto_pos_int, default=[2] + , help="A list of the number of components to use in UMAP. 
Default is [2].") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR', default=f'{os.getcwd()}/umaps' + , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level for vprint function + vprint.lvl = clargs.verbose + + #for x,y,z in [(x, y, z) for x in clargs.n_neighbors + # for y in clargs.min_dist + # for z in clargs.n_components]: + # render_umap(df, x, y, z) + + params=[(x, y, z) for x in clargs.n_neighbors + for y in clargs.min_dist + for z in clargs.n_components] + if clargs.slurm: + vprint(2, f'running with slurm') + for (n_neighbors, min_dist, n_components) in params: + vprint(3, f'running with n_neighbors={n_neighbors}, min_dist={min_dist}, n_components={n_components}') + print('Directory Name: ', os.path.dirname(__file__)) + + cmd = [ "srun" + , "-t", "50:00:00" + , "--mem=200G" + , "--gpus=a100:1" + , "--job-name", f"render_umap_{n_neighbors}_{min_dist}_{n_components}" + , "--pty" + , "python3", "-c" + , f""" +import sys +sys.path.insert(1, '{os.path.dirname(__file__)}') +import genUMAPs +genUMAPs.render_umap('{clargs.latent_space}','{clargs.output_dir}',{n_neighbors},{min_dist},{n_components}) +"""] + vprint(4, cmd) + subprocess.run(cmd) + + else: + vprint(2, f'running with python multiprocessing') + + # create output directory if it does not already exist + os.makedirs(clargs.output_dir, exist_ok=True) + + # load latent space + df = pd.read_pickle(clargs.latent_space) + + def render_umap_wrapper(args): + render_umap(df, clargs.output_dir, *args) + with multiprocessing.Pool(clargs.n_jobs) as pool: + pool.starmap(render_umap_wrapper, params) \ No newline at end of file From ccdc090caf77188c3053beb0036db2abdbeacb63 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 26 Apr 2024 08:52:05 +0100 Subject: [PATCH 108/204] updated the render umap script with a _hardcoded_ trick to extract index from filename for the tree dataset (should use a user specified regex instead) --- scripts/shapes/genUMAPs.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/genUMAPs.py b/scripts/shapes/genUMAPs.py index 09121647..265f0525 100755 --- a/scripts/shapes/genUMAPs.py +++ b/scripts/shapes/genUMAPs.py @@ -40,8 +40,15 @@ def render_umap_core(df, output_dir, n_neighbors, min_dist, n_components): mapper = umap_model.fit(df.drop(['class_idx','class','fname'], axis=1)) umap.plot.points(mapper, labels=np.array(df['class'])) plt.savefig(f'{output_dir}/{name}.png') + theme_values = df.drop(['class_idx','class','fname'], axis=1).mean(axis=1) + vprint(5, f'theme_values type: {type(theme_values)}') + if True: #temporary condition to work ONLY with the tree dataset + theme_values = list(map(lambda x: int(x.split('_')[-1].split('.')[0]), df['fname'])) + vprint(5, f'new theme_values type: {type(theme_values)}') + vprint(5, f'theme_values: {theme_values}') #p = umap.plot.interactive(mapper, labels=df['class_idx'], hover_data=df[['class','fname']]) - p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + #p = umap.plot.interactive(mapper, values=df.drop(['class_idx','class','fname'], axis=1).mean(axis=1), theme='viridis', hover_data=df[['class','fname']]) + p = umap.plot.interactive(mapper, values=theme_values, theme='viridis', 
hover_data=df[['class','fname']]) # save interactive plot as html bokeh.plotting.output_file(f"{output_dir}/{name}.html") bokeh.plotting.save(p) @@ -113,6 +120,7 @@ def auto_pos_int (x): import sys sys.path.insert(1, '{os.path.dirname(__file__)}') import genUMAPs +genUMAPs.vprint.lvl = {clargs.verbose} genUMAPs.render_umap('{clargs.latent_space}','{clargs.output_dir}',{n_neighbors},{min_dist},{n_components}) """] vprint(4, cmd) From 3601e3bd2f8ee5f892d3786f1dc924be431ca1b8 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Apr 2024 22:11:00 +0100 Subject: [PATCH 109/204] minor config + comments --- scripts/shapes/distmatrix2embeding.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 07b0062a..ff02861a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -58,11 +58,9 @@ def main_process(params): ########################################################################### preproc_transform = transforms.Compose([ - lambda x: x / 256, # scale the matrix to the number of pixels - #lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix - lambda x: x*100, # scale the matrix + lambda x: x / np.linalg.norm(x, "fro"), # normalize the matrix + #lambda x: x*1000, # scale the matrix #lambda x: x / x.max(), # normalize each element to one using the max value (0-1) - #lambda x: x*255, # scale the matrix to 255 lambda x: maybe_roll(x, p = 1.0), # "potentially" roll the matrix sanity_check, # check if the matrix is symmetric and positive, and the diagonal is zero torch.as_tensor, # turn (H,W) numpy array into a (H,W) tensor @@ -97,6 +95,9 @@ def main_process(params): jobname = f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) wandblogger.watch(lit_model, log="all") + # TODO: Sanity check: + # test_data = dataset[0][0].unsqueeze(0) + # test_output = lit_model.forward((test_data,)) # Train the model ########################################################################### @@ -211,6 +212,9 @@ def main_process(params): # Save the figure plt.savefig(f'{output_dir}/umap_with_kmeans_clusters.png') + + # Test embeding for a classifcation task + # default parameters ############################################################################### From 3f98d068b31cc1a8b576a9a495ab230cd90b9215 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 8 May 2024 22:26:50 +0100 Subject: [PATCH 110/204] added a beta vae model --- bioimage_embed/models/factory.py | 14 ++++++++++++++ bioimage_embed/shapes/lightning.py | 4 ++-- scripts/shapes/distmatrix2embeding.py | 15 ++++++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 749ebaa6..713b98af 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -123,6 +123,19 @@ def resnet18_vqvae(self): bolts.ResNet18VQVAEDecoder, ) + def resnet18_beta_vae(self): + return self.create_model( + partial( + pythae.models.BetaVAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.BetaVAE, + bolts.ResNet18VAEEncoder, + bolts.ResNet18VAEDecoder, + ) + def resnet50_vqvae(self): return self.create_model( partial( @@ -177,6 +190,7 @@ def resnet152_vqvae_legacy(self): MODELS = [ "resnet18_vae", + "resnet18_beta_vae", 
"resnet50_vae", "resnet18_vae_bolt", "resnet50_vae_bolt", diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index e8dfee80..02202fb8 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -63,8 +63,8 @@ def loss_function(self, model_output, *args, **kwargs): "loss": loss, "shape_loss": shape_loss, "reconstruction_loss": model_output.recon_loss, - "vq_loss": model_output.vq_loss, - #"kdl_vae_loss": model_output.KLD + #"vq_loss": model_output.vq_loss, + "KLD_loss": model_output.reg_loss, }, on_epoch=True, prog_bar=True, diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ff02861a..9a8f3dee 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -17,6 +17,7 @@ import pathlib import torch import types +import re # Seed everything np.random.seed(42) @@ -81,11 +82,15 @@ def main_process(params): # Build the model ########################################################################### + extra_params = {} + if re.match(".*_beta_vae", params.model): + extra_params['beta'] = params.model_beta_vae_beta model = bioimage_embed.models.create_model( model=params.model, input_dim=params.input_dim, latent_dim=params.latent_dim, pretrained=params.pretrained, + **extra_params ) lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') @@ -222,6 +227,7 @@ def main_process(params): models = [ "resnet18_vae" , "resnet50_vae" +, "resnet18_beta_vae" , "resnet18_vae_bolt" , "resnet50_vae_bolt" , "resnet18_vqvae" @@ -250,6 +256,8 @@ def main_process(params): "decay": 0.99, "frobenius_norm": False, "dataset": ("tiny_dist", "/nfs/research/uhlmann/afoix/distmat_datasets/tiny_synthcellshapes_dataset_distmat"), + # model-specific params + "model_beta_vae_beta": 1, # optimizer_params "opt": "AdamW", "lr": 0.001, @@ -280,6 +288,9 @@ def auto_pos_int (x): parser.add_argument( '-m', '--model', choices=models, metavar='MODEL' , help=f"The MODEL to use, one of {models} (default {params.model}).") + parser.add_argument( + '--model-beta-vae-beta', type=float, metavar='BETA' + , help=f"The BETA parameter to use for a beta-vae model.") parser.add_argument( '-d', '--dataset', nargs=2, metavar=('NAME', 'PATH') , help=f"The NAME of and PATH to the dataset (default: {params.dataset})") @@ -318,6 +329,8 @@ def auto_pos_int (x): # update default params with clargs if clargs.model: params.model = clargs.model + if clargs.model_beta_vae_beta: + params.model_beta_vae_beta = clargs.model_beta_vae_beta params.output_dir = clargs.output_dir if clargs.dataset: params.dataset = clargs.dataset @@ -339,4 +352,4 @@ def auto_pos_int (x): params.epochs = clargs.num_epochs # run main process - main_process(params) \ No newline at end of file + main_process(params) From f6ee1ac158291d110d2bd22e1859b3b5f16af707 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 8 May 2024 22:45:06 +0100 Subject: [PATCH 111/204] added extra parameters in the wandb jobname --- scripts/shapes/distmatrix2embeding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 9a8f3dee..227b795b 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -97,7 +97,7 @@ def main_process(params): # WandB logger ########################################################################### - jobname = 
f"{params.model}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" + jobname = f"{params.model}_{'_'.join([f'{k}{v}' for k, v in extra_params.items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) wandblogger.watch(lit_model, log="all") # TODO: Sanity check: From 01b746fc3b44ef7371312fc600cf26f6e0cb260a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 10 May 2024 22:42:21 +0100 Subject: [PATCH 112/204] finer grained clargs around latent space related parameters --- scripts/shapes/distmatrix2embeding.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 227b795b..eaf28376 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -248,9 +248,9 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - "latent_dim": 512, - "num_embeddings": 512, - "num_hiddens": 512, + "latent_dim": 1024, + "num_embeddings": 1024, + "num_hiddens": 1024, "pretrained": True, "commitment_cost": 0.25, "decay": 0.99, @@ -309,6 +309,15 @@ def auto_pos_int (x): parser.add_argument( '-l', '--latent-space-size', metavar='LATENT_SPACE_SIZE', type=auto_pos_int , help=f"The LATENT_SPACE_SIZE, a positive integer (default {params.latent_dim})") + parser.add_argument( + '--input-dimensions', metavar='INPUT_DIM', nargs=2, type=auto_pos_int + , help=f"The width and height INPUT_DIM for the input dimensions (default {params.input_dim[1]} and {params.input_dim[2]})") + parser.add_argument( + '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int + , help=f"The NUM_EMBEDDINGS, a positive integer (default {params.num_embeddings})") + parser.add_argument( + '--number-hiddens', metavar='NUM_HIDDENS', type=auto_pos_int + , help=f"The NUM_HIDDENS, a positive integer (default {params.num_hiddens})") parser.add_argument( '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int , help=f"The NUM_WORKERS for the run, a positive integer (default {params.num_workers})") @@ -341,11 +350,13 @@ def auto_pos_int (x): if clargs.batch_size: params.batch_size = clargs.batch_size if clargs.latent_space_size: - interp_size = clargs.latent_space_size * 2 - params.input_dim = (params.input_dim[0], interp_size, interp_size) - params.latent_dim = interp_size - params.num_embeddings = interp_size - params.num_hiddens = interp_size + params.latent_dim = clargs.latent_space_size + if clargs.input_dimensions: + params.input_dim = (params.input_dim[0], clargs.input_dimensions[0], clargs.input_dimensions[1]) + if clargs.number_embeddings: + params.num_embeddings = clargs.number_embeddings + if clargs.number_hiddens: + params.num_hiddens = clargs.number_hiddens if clargs.num_workers: params.num_workers = clargs.num_workers if clargs.num_epochs: From d493ea5f06cb5532f0f415da07dd8fc5d9b7f971 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 10 May 2024 23:09:51 +0100 Subject: [PATCH 113/204] log different losses for vq or beta models --- bioimage_embed/shapes/lightning.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bioimage_embed/shapes/lightning.py b/bioimage_embed/shapes/lightning.py index 02202fb8..a9a1e947 100644 --- a/bioimage_embed/shapes/lightning.py +++ b/bioimage_embed/shapes/lightning.py @@ -7,6 +7,7 @@ from torch import nn from ..lightning import 
LitAutoEncoderTorch from . import loss_functions as lf +import pythae from pythae.models.base.base_utils import ModelOutput from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint from types import SimpleNamespace @@ -58,14 +59,18 @@ def loss_function(self, model_output, *args, **kwargs): #variational_loss = model_output.loss - model_output.recon_loss + metrics = { + "loss": loss, + "shape_loss": shape_loss, + "reconstruction_loss": model_output.recon_loss, + } + if isinstance(self.model, pythae.models.VQVAE): + metrics["vq_loss"] = model_output.vq_loss + if isinstance(self.model, pythae.models.BetaVAE): + metrics['KLD_loss'] = model_output.reg_loss + self.log_dict( - { - "loss": loss, - "shape_loss": shape_loss, - "reconstruction_loss": model_output.recon_loss, - #"vq_loss": model_output.vq_loss, - "KLD_loss": model_output.reg_loss, - }, + metrics, on_epoch=True, prog_bar=True, logger=True, @@ -144,4 +149,4 @@ def training_step(self, batch, batch_idx, optimizer_idx=0): def configure_optimizers(self): opt_ed, lr_s_ed = self.timm_optimizers(self.model) - return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) \ No newline at end of file + return self.timm_to_lightning(optimizer=opt_ed, lr_scheduler=lr_s_ed) From b15dff4d6dc15150ce1703fb7e9cdf1abdf27ac4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 13 May 2024 17:24:58 +0100 Subject: [PATCH 114/204] code to do classification using the features of the latent space --- scripts/shapes/check_latent_space.py | 125 +++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 scripts/shapes/check_latent_space.py diff --git a/scripts/shapes/check_latent_space.py b/scripts/shapes/check_latent_space.py new file mode 100644 index 00000000..6fb085a4 --- /dev/null +++ b/scripts/shapes/check_latent_space.py @@ -0,0 +1,125 @@ +import pandas as pd +import numpy as np +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split, cross_validate +from sklearn.preprocessing import StandardScaler +from sklearn.decomposition import PCA +from sklearn.pipeline import Pipeline +from sklearn import svm +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.metrics import classification_report, confusion_matrix +import umap +import seaborn as sns +import matplotlib.pyplot as plt +import os +from tabulate import tabulate +import json + +pd.set_option('display.max_colwidth', None) + +df = pd.read_csv("clustered_data.csv") + +df.insert(0, 'label', df['fname'].str.extract(r'^(?:[^/]*/){7}([^/]*)').squeeze()) +df.insert(0, 'n_label', df['label'].apply(lambda x: 0 if x == 'alive' else 1)) + +new_df = df.iloc[:, :-4] + +y = new_df.iloc[:, 0] +X = new_df.iloc[:, 2:] + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) + +def build_and_evaluate_model(clf, X_train, y_train, X_test, y_test): + model = Pipeline( + [ + ("scaler", StandardScaler()), + ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", clf), + ] + ) + + pipeline = model.fit(X_train, y_train) + + score = pipeline.score(X_test, y_test) + print(f"Classification score: {score}") + + y_pred = pipeline.predict(X_test) + + print("Classification Report:") + print(classification_report(y_test, y_pred)) + + print("Confusion Matrix:") + cm = confusion_matrix(y_test, y_pred) + print(cm) + + # Cross-validation + cv_results = cross_validate(pipeline, X, y, cv=5) + print("Cross-validation results:") + print(cv_results) + + # Plot and save 
the confusion matrix + plt.figure(figsize=(10,7)) + sns.heatmap(cm, annot=True, fmt='d') + plt.xlabel('Predicted') + plt.ylabel('Truth') + plt.title(f'Confusion Matrix for {clf.__class__.__name__}') + plt.savefig(f'confusion_matrix_{clf.__class__.__name__}.png') + plt.clf() # Clear the current figure + + return score, cm, cv_results + +classifiers = [RandomForestClassifier(), GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0), svm.SVC()] + +results = [] + +for clf in classifiers: + score, cm, cv_results = build_and_evaluate_model(clf, X_train, y_train, X_test, y_test) + results.append((clf.__class__.__name__, score, cm, cv_results)) + +known_labels = list(y[:50]) +unknown_labels = [-1]*len(y[50:]) +partial_labels = known_labels + unknown_labels + +reducer = umap.UMAP() +embedding = reducer.fit_transform(X, y=partial_labels) + +plt.scatter(embedding[:, 0], embedding[:, 1], c=partial_labels, cmap='Spectral', s=5) +plt.gca().set_aspect('equal', 'datalim') +plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10)) +plt.title('UMAP projection of the dataset', fontsize=24) + +plt.savefig('umap_visualization.png') +plt.clf() # Clear the current figure + +# Generate LaTeX report +with open('final_report.tex', 'w') as f: + f.write("\\documentclass{article}\n\\usepackage{graphicx}\n\\usepackage{longtable}\n\\usepackage{listings}\n\\begin{document}\n") + for name, score, cm, cv_results in results: + f.write(f"\\section*{{Results for {name}}}\n") + f.write("\\begin{longtable}{|l|l|}\n") + f.write("\\hline\n") + f.write(f"Classification Score & {score} \\\\\n") + f.write("\\hline\n") + f.write("Confusion Matrix & \\\\\n") + f.write("\\begin{lstlisting}\n") + f.write(np.array2string(cm).replace('\n', ' \\\\\n')) + f.write("\\end{lstlisting}\n") + f.write("\\hline\n") + f.write("Cross-validation Results & \\\\\n") + f.write("\\begin{lstlisting}\n") + cv_results_df = pd.DataFrame(cv_results) + cv_results_df = cv_results_df.applymap(lambda x: x.tolist() if isinstance(x, np.ndarray) else x) + f.write(cv_results_df.to_string().replace('\n', ' \\\\\n')) + f.write("\\end{lstlisting}\n") + f.write("\\hline\n") + f.write("\\end{longtable}\n") + f.write("\\section*{UMAP visualization}\n") + f.write("\\includegraphics[width=\\textwidth]{umap_visualization.png}\n") + f.write("\\end{document}\n") + +os.system('pdflatex final_report.tex') + +# Generate CSV report +report_df = pd.DataFrame(results, columns=['Classifier', 'Score', 'Confusion Matrix', 'Cross-validation Results']) +report_df['Cross-validation Results'] = report_df['Cross-validation Results'].apply(lambda x: pd.DataFrame(x).applymap(lambda y: y.tolist() if isinstance(y, np.ndarray) else y).to_dict()) +report_df.to_csv('final_report.csv', index=False) From 1a2bcf15fb51e60f192064f8920137c69138399e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:36:47 +0100 Subject: [PATCH 115/204] new latent space size --- scripts/shapes/distmatrix2embeding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index eaf28376..c388b047 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -241,6 +241,7 @@ def main_process(params): , "resnet50_vae_legacy" ] +matrix_dim = 512 params = types.SimpleNamespace(**{ # general params "model": "resnet18_vae", @@ -248,7 +249,7 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - 
"latent_dim": 1024, + "latent_dim": int((matrix_dim**2 - matrix_dim) / 2), "num_embeddings": 1024, "num_hiddens": 1024, "pretrained": True, From b6f12c2dfb32c6c903f4c7b1de1ede03efc79aac Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:38:38 +0100 Subject: [PATCH 116/204] Added imports that will be needed for next commits --- scripts/shapes/distmatrix2embeding.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index c388b047..ff5145c2 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -1,3 +1,5 @@ +import seaborn as sns +import pyefd from torchvision import datasets, transforms import pytorch_lightning as pl import pandas as pd @@ -12,12 +14,34 @@ import bioimage_embed.lightning from bioimage_embed.lightning import DataModule from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint import argparse import datetime import pathlib import torch import types import re +import shutil +from pathlib import Path +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold +from sklearn.metrics import make_scorer +from sklearn import metrics +from sklearn.discriminant_analysis import StandardScaler +from sklearn.ensemble import RandomForestClassifier +from sklearn.pipeline import Pipeline +from skimage import measure +from tqdm import tqdm +import logging + +from bioimage_embed.shapes.transforms import ( + ImageToCoords, + CropCentroidPipeline +) + +import pickle +import base64 +import hashlib +import os # Seed everything np.random.seed(42) From 79e75ced94925d971fad1542b0a45dd7fed3fc57 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:40:22 +0100 Subject: [PATCH 117/204] Added checkpoint mechanism --- scripts/shapes/distmatrix2embeding.py | 42 +++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index ff5145c2..45da013f 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -118,7 +118,15 @@ def main_process(params): ) lit_model = bioimage_embed.shapes.MaskEmbed(model, params) vprint(1, f'model ready') - + + model_dir = f"checkpoints/{hashing_fn(params)}" + + + if clargs.clear_checkpoints: + print("cleaning checkpoints") + shutil.rmtree("checkpoints/") + model_dir = f"checkpoints/{hashing_fn(params)}" + # WandB logger ########################################################################### jobname = f"{params.model}_{'_'.join([f'{k}{v}' for k, v in extra_params.items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset[0]}" @@ -131,22 +139,46 @@ def main_process(params): # Train the model ########################################################################### + Path(f"{model_dir}/").mkdir(parents=True, exist_ok=True) + + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/", + save_last=True, + save_top_k=1, + monitor="loss/val", + mode="min", + ) + trainer = pl.Trainer( logger=[wandblogger], gradient_clip_val=0.5, - enable_checkpointing=False, + enable_checkpointing=True, devices=1, accelerator="gpu", accumulate_grad_batches=4, - #TODO callbacks=[checkpoint_callback], + callbacks=[checkpoint_callback], min_epochs=50, max_epochs=params.epochs, log_every_n_steps=1, ) + + # Determine the checkpoint path for resuming + last_checkpoint_path 
= f"{model_dir}/last.ckpt" + best_checkpoint_path = checkpoint_callback.best_model_path + + # Check if a last checkpoint exists to resume from + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + trainer.fit(lit_model, datamodule=dataloader) lit_model.eval() vprint(1, f'trainer fitted') + #TODO: Validate the model ########################################################################### vprint(1, f'Validate the model') @@ -349,8 +381,8 @@ def auto_pos_int (x): parser.add_argument( '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int , help=f"The NUM_EPOCHS for the run, a positive integer (default {params.epochs})") - #parser.add_argument('--clear-checkpoints', action='store_true' - # , help='remove checkpoints') + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") From a8406ea52dcdbf87cd07d1590077ee1d873bc1ac Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 5 Jun 2024 10:41:35 +0100 Subject: [PATCH 118/204] Added regionprops + fourrier decomposition trials (! hardcoded path to synthetic shapes dataset needs to be generalised) --- scripts/shapes/distmatrix2embeding.py | 203 +++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 45da013f..b42cd663 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -74,6 +74,156 @@ def sanity_check (dist_mat): raise ValueError("Matrix has non-zero diagonal") return dist_mat +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def scoring_df(X, y): + # Split the data into training and test sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y + ) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.accuracy_score), + "precision": make_scorer(metrics.precision_score, average="macro"), + "recall": make_scorer(metrics.recall_score, average="macro"), + "f1": make_scorer(metrics.f1_score, average="macro"), + } + + # Create a random forest classifier + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + # ("pca", PCA(n_components=0.95, whiten=True, random_state=42)), + ("clf", RandomForestClassifier()), + # ("clf", DummyClassifier()), + ] + ) + + # Specify the number of folds + k_folds = 5 + + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=pipeline, + X=X, + y=y, + cv=KFold(n_splits=k_folds), + scoring=scoring, + n_jobs=-1, + return_train_score=False, + ) + + # Put the results into a DataFrame + return pd.DataFrame(cv_results) + +def create_regionprops_df( dataset + , properties = [ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] ): + dfs = [] + # Distance matrix data + for i, data in enumerate(tqdm(dataset)): + X, y = data + # Do regionprops here + # Calculate shape summary statistics using regionprops + # We're considering that the mask has only one object, so we take the first element [0] + # 
props = regionprops(np.array(X).astype(int))[0] + props_table = measure.regionprops_table( + np.array(X).astype(int), properties=properties + ) + + # Store shape properties in a dataframe + df = pd.DataFrame(props_table) + + # Assuming the class or label is contained in 'y' variable + df["class"] = y + df.set_index("class", inplace=True) + dfs.append(df) + + return pd.concat(dfs) + +def create_efd_df(dataset): + dfs = [] + for i, data in enumerate(tqdm(dataset)): + # Convert the tensor to a numpy array + X, y = data + print(f" The image: {i}") + + # Feed it to PyEFD's calculate_efd function + coeffs = pyefd.elliptic_fourier_descriptors(X, order=10, normalize=False) + # coeffs_df = pd.DataFrame({'class': [y], 'norm_coeffs': [norm_coeffs.flatten().tolist()]}) + + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pd.DataFrame( + { + "norm_coeffs": norm_coeffs.flatten().tolist(), + "coeffs": coeffs.flatten().tolist(), + } + ).T.rename_axis("coeffs") + df["class"] = y + df.set_index("class", inplace=True, append=True) + dfs.append(df) + + return pd.concat(dfs) + +def run_trials( trials, outputdir + , logger = logging.getLogger(__name__) + , width = 3.45 + , height = 3.45 / 1.618 ): + trial_df = pd.DataFrame() + for trial in trials: + X = trial["features"] + y = trial["labels"] + trial["score_df"] = scoring_df(X, y) + trial["score_df"]["trial"] = trial["name"] + logger.info(trial["score_df"]) + trial["score_df"].to_csv(f"{outputdir}/{trial['name']}_score_df.csv") + trial_df = pd.concat([trial_df, trial["score_df"]]) + trial_df = trial_df.drop(["fit_time", "score_time"], axis=1) + + trial_df.to_csv(f"{outputdir}/trial_df.csv") + trial_df.groupby("trial").mean().to_csv(f"{outputdir}/trial_df_mean.csv") + trial_df.plot(kind="bar") + + avg = trial_df.groupby("trial").mean() + logger.info(avg) + avg.to_latex(f"{outputdir}/trial_df.tex") + + melted_df = trial_df.melt(id_vars="trial", var_name="Metric", value_name="Score") + # fig, ax = plt.subplots(figsize=(width, height)) + ax = sns.catplot( + data=melted_df, + kind="bar", + x="trial", + hue="Metric", + y="Score", + errorbar="se", + height=height, + aspect=width * 2**0.5 / height, + ) + # ax.xtick_params(labelrotation=45) + # plt.legend(loc='lower center', bbox_to_anchor=(1, 1)) + # sns.move_legend(ax, "lower center", bbox_to_anchor=(1, 1)) + # ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + # plt.tight_layout() + plt.savefig(f"{outputdir}/trials_barplot.pdf") + plt.close() + + avs = ( + melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean() + ) + logger.info(avs) + # Main process ############################################################################### @@ -219,12 +369,60 @@ def main_process(params): # Save the latent space vprint(1, f'pull the embedings') latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + scalings = torch.stack([d.x.scalings.flatten() for d in predictions]) + np.save(f'{output_dir}/latent_space.npy', latent_space) df = pd.DataFrame(latent_space) df['class_idx'] = class_indices - df['class'] = [dataset.classes[x] for x in class_indices] + #df['class'] = [dataset.classes[x] for x in class_indices] + df['class'] = pd.Series([dataset.classes[x] for x in class_indices]).astype("category") df['fname'] = filenames + #df['scale'] = scalings[:,0].squeeze() df.to_pickle(f'{output_dir}/latent_space.pkl') + + df_shape_embed = df.drop('fname', axis=1).copy() + df_shape_embed = df_shape_embed.set_index('class') + #regionprop_dataset = 
datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + transforms.Grayscale(1) + #, CropCentroidPipeline(128 * 2) + ])) + df_regionprops = create_regionprops_df(regionprop_dataset) + #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + transforms.Grayscale(1) + #, CropCentroidPipeline(128 * 2) + , ImageToCoords(128 * 2) + ])) + print(efd_dataset) + df_efd = create_efd_df(efd_dataset) + + # setup trials + trials = [ + { + "name": "mask_embed", + "features": df_shape_embed.to_numpy(), + "labels": df_shape_embed.index, + }, + { + "name": "fourier_coeffs", + "features": df_efd.xs("coeffs", level="coeffs"), + "labels": df_efd.xs("coeffs", level="coeffs").index, + }, + # {"name": "fourier_norm_coeffs", + # "features": df_efd.xs("norm_coeffs", level="coeffs"), + # "labels": df_efd.xs("norm_coeffs", level="coeffs").index + # } + { + "name": "regionprops", + "features": df_regionprops, + "labels": df_regionprops.index, + } + ] + + run_trials(trials, output_dir) + + # Save the (original input and) reconstructions for i, (pred, class_idx, fname) in enumerate(zip(predictions, class_indices, filenames)): vprint(5, f'pred#={i}, class_idx={class_idx}, fname={fname}') @@ -419,5 +617,6 @@ def auto_pos_int (x): if clargs.num_epochs: params.epochs = clargs.num_epochs + logging.basicConfig(level=logging.INFO) # run main process - main_process(params) + main_process(params) \ No newline at end of file From 03a4cf7f03928a7c65382e65d84d6da383e9663d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 12:41:05 +0100 Subject: [PATCH 119/204] Adding a n compression parameter for the latent space size --- scripts/shapes/distmatrix2embeding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index b42cd663..6c4ff928 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -496,6 +496,7 @@ def main_process(params): ] matrix_dim = 512 +n = 2 params = types.SimpleNamespace(**{ # general params "model": "resnet18_vae", @@ -503,7 +504,8 @@ def main_process(params): "batch_size": 4, "num_workers": 2**4, "input_dim": (3, 512, 512), - "latent_dim": int((matrix_dim**2 - matrix_dim) / 2), + #"latent_dim": int((matrix_dim**2 - matrix_dim) / 2), + "latent_dim": int((matrix_dim*(matrix_dim-1))/2**n), "num_embeddings": 1024, "num_hiddens": 1024, "pretrained": True, From 5467f9f3ecc1dc0ee325bdcb730ddac9d9634257 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 13:06:25 +0100 Subject: [PATCH 120/204] improve scoring function and use StratifiedKFold instead of KFold for cross validation --- scripts/shapes/distmatrix2embeding.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 6c4ff928..2746e793 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -87,10 +87,11 @@ def scoring_df(X, y): ) # Define a dictionary of metrics scoring = { - "accuracy": make_scorer(metrics.accuracy_score), + "accuracy": 
make_scorer(metrics.balanced_accuracy_score), "precision": make_scorer(metrics.precision_score, average="macro"), "recall": make_scorer(metrics.recall_score, average="macro"), "f1": make_scorer(metrics.f1_score, average="macro"), + #"roc_auc": make_scorer(metrics.roc_auc_score, average="macro") } # Create a random forest classifier @@ -111,7 +112,7 @@ def scoring_df(X, y): estimator=pipeline, X=X, y=y, - cv=KFold(n_splits=k_folds), + cv=StratifiedKFold(n_splits=k_folds), scoring=scoring, n_jobs=-1, return_train_score=False, From 344c8ab7fc1351d74f6167d9bd171eab75489845 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 14:03:10 +0100 Subject: [PATCH 121/204] hardcoded commited setup now points to quick test setup --- scripts/shapes/distmatrix2embeding.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/shapes/distmatrix2embeding.py b/scripts/shapes/distmatrix2embeding.py index 2746e793..f23b1d2a 100644 --- a/scripts/shapes/distmatrix2embeding.py +++ b/scripts/shapes/distmatrix2embeding.py @@ -383,14 +383,14 @@ def main_process(params): df_shape_embed = df.drop('fname', axis=1).copy() df_shape_embed = df_shape_embed.set_index('class') - #regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ - regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + #regionprop_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ transforms.Grayscale(1) #, CropCentroidPipeline(128 * 2) ])) df_regionprops = create_regionprops_df(regionprop_dataset) - #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ - efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ + efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/tiny_broken_synthetic_shapes/', transform=transforms.Compose([ + #efd_dataset = datasets.ImageFolder('/nfs/research/uhlmann/afoix/image_datasets/synthetic_shapes/', transform=transforms.Compose([ transforms.Grayscale(1) #, CropCentroidPipeline(128 * 2) , ImageToCoords(128 * 2) @@ -496,7 +496,8 @@ def main_process(params): , "resnet50_vae_legacy" ] -matrix_dim = 512 +#matrix_dim = 512 +matrix_dim = 4 n = 2 params = types.SimpleNamespace(**{ # general params From 9a6456e288cf9d1bf7b12a61ff5480dab48abb52 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 16 Jun 2024 19:51:52 +0100 Subject: [PATCH 122/204] initial refactor commit, script with split up functionnalities, missing metrics --- scripts/shapeembed/__init__.py | 1 + scripts/shapeembed/dataset_transformations.py | 148 ++++++++ scripts/shapeembed/shapeembed.py | 358 ++++++++++++++++++ 3 files changed, 507 insertions(+) create mode 100644 scripts/shapeembed/__init__.py create mode 100644 scripts/shapeembed/dataset_transformations.py create mode 100755 scripts/shapeembed/shapeembed.py diff --git a/scripts/shapeembed/__init__.py b/scripts/shapeembed/__init__.py new file mode 100644 index 00000000..e5853d2e --- /dev/null +++ b/scripts/shapeembed/__init__.py @@ -0,0 +1 @@ +from .dataset_transformations import 
mask2distmatrix diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py new file mode 100644 index 00000000..1cd76c7f --- /dev/null +++ b/scripts/shapeembed/dataset_transformations.py @@ -0,0 +1,148 @@ +import numpy as np +import imageio.v3 as iio +import skimage as sk +from scipy.interpolate import splprep, splev +import scipy.spatial +import argparse +import pathlib +import types +import glob +import os +import logging + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +# misc helpers +############################################################################### + +def rgb2grey(rgb, cr = 0.2989, cg = 0.5870, cb = 0.1140): + """Turn an rgb array into a greyscale array using the following reduction: + grey = cr * r + cg * g + cb * b + + :param rgb: The rgb array + :param cr: The red coefficient + :param cg: The green coefficient + :param cb: The blue coefficient + + :returns: The greyscale array. + """ + r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] + return cr * r + cg * g + cb * b + +# API functions +############################################################################### + +def find_longest_contour(mask, normalise_coord=False): + """Find all contours existing in 'mask' and return the longest one + + :param mask: The image with masked objects + :param normalise_coord(default: False): optionally normalise coordinates + + :returns: the longest contour as a pair of lists for the x and y + coordinates + """ + # force the image to grayscale + if len(mask.shape) == 3: # (lines, columns, number of channels) + mask = rgb2grey(mask) + # extract the contours from the now grayscale image + contours = sk.measure.find_contours(mask, 0.8) + logger.debug(f'find_longest_contour: len(contours) {len(contours)}') + # sort the contours by length + contours = sorted(contours, key=lambda x: len(x), reverse=True) + # isolate the longest contour (first in the sorted list) + x, y = contours[0][:, 0], contours[0][:, 1] + # optionally normalise the coordinates in the countour + if normalise_coord: + x = x - np.min(x) + x = x / np.max(x) + y = y - np.min(y) + y = y / np.max(y) + # return the contour as a pair of lists of x and y coordinates + return x, y + +def spline_interpolation(x, y, spline_sampling, raw_sampling_sparsity=1): + """Return a resampled spline interpolation of a provided contour + + :param x: The list of x coordinates of a contour + :param y: The list of y coordinates of a contour + :param spline_sampling: The number of points to sample on the spline + :param raw_sampling_sparsity (default=1): + The distance (in number of gaps) to the next point to consider in the + raw contour (i.e. whether consider every point, every other point + , every 3 points... 
This might be considered to avoid artifacts due to + high point count contours over low pixel resolution images, with contour + effectively curving around individual pixel edges) + + :returns: the resampled spline with spline_sampling points as a pair of + lists of x and y coordinates + """ + # Force sparsity to be at least one + raw_sampling_sparsity = max(1, raw_sampling_sparsity) + logger.debug(f'spline_interpolation: running with raw_sampling_sparsity {raw_sampling_sparsity} and spline_sampling {spline_sampling}') + logger.debug(f'spline_interpolation: x.shape {x.shape} y.shape {y.shape}') + # prepare the spline interpolation of the given contour + tck, u = splprep( [x[::raw_sampling_sparsity], y[::raw_sampling_sparsity]] + , s = 0 # XXX + , per = True # closed contour (periodic spline) + ) + # how many times to sample the spline + # last parameter is how dense is our spline, how many points. + new_u = np.linspace(u.min(), u.max(), spline_sampling) + # evaluate and return the sampled spline + x_spline, y_spline = splev(new_u, tck) + return x_spline, y_spline + +def build_distance_matrix(x_reinterpolated, y_reinterpolated): + """Turn a (reinterpolated) contour into a distance matrix + + :param x_reinterpolated: The list of x coordinates of a contour + :param y_reinterpolated: The list of y coordinates of a contour + + :returns: the distance matrix characteristic of the provided contour + """ + # reshape the pair of lists of individual x and y coordinates as a single + # numpy array of pairs of (x,y) coordinates + reinterpolated_contour = np.column_stack([ x_reinterpolated + , y_reinterpolated ]) + # build the distance matrix from the reshaped input data + dm = scipy.spatial.distance_matrix( reinterpolated_contour + , reinterpolated_contour ) + return dm + +def dist_to_coords(dst_mat): + """Turn a distance matrix into the corresponding contour + XXX + TODO sort out exactly the specifics here... + """ + embedding = MDS(n_components=2, dissimilarity='precomputed') + return embedding.fit_transform(dst_mat) + +def mask2distmatrix(mask, matrix_size=512, raw_sampling_sparsity=1): + """Get the distance matrix characteristic of the (biggest) object in the + provided image + + :param mask: The image with masked objects + :param matrix_size(default: 512): the desired matrix size + :param raw_sampling_sparsity (default=1): + The distance (in number of gaps) to the next point to consider in the + raw contour (i.e. whether consider every point, every other point + , every 3 points... 
This might be considered to avoid artifacts due to + high point count contours over low pixel resolution images, with contour + effectively curving around individual pixel edges) + + :returns: the distance matrix characteristic of the (biggest) object in + the provided image + """ + logger.debug(f'mask2distmatrix: running with raw_sampling_sparsity {raw_sampling_sparsity} and matrix_size {matrix_size}') + # extract mask contour + x, y = find_longest_contour(mask, normalise_coord=True) + logger.debug(f'mask2distmatrix: found contour shape x {x.shape} y {y.shape}') + # Reinterpolate (spline) + x_reinterpolated, y_reinterpolated = spline_interpolation(x, y, matrix_size, raw_sampling_sparsity) + # Build the distance matrix + dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) + logger.debug(f'mask2distmatrix: created distance matrix shape {dm.shape}') + return dm diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py new file mode 100755 index 00000000..d4bf8a40 --- /dev/null +++ b/scripts/shapeembed/shapeembed.py @@ -0,0 +1,358 @@ +#! /usr/bin/env python3 + +# general utils +import os +import re +import copy +import types +import pickle +import base64 +import hashlib +import logging +import functools + +# machine learning utils +import torch +from torchvision import datasets, transforms +import pytorch_lightning as pl +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint + +# own source files +import bioimage_embed +import bioimage_embed.shapes +from dataset_transformations import * + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) + +# script inputs and parameters +############################################################################### + +# available types of datasets (raw, masks, distance matrix) +dataset_types = [ + "raw_image" +, "mask" +, "distance_matrix" +] + +# available models +models = [ + "resnet18_vae" +, "resnet50_vae" +, "resnet18_beta_vae" +, "resnet18_vae_bolt" +, "resnet50_vae_bolt" +, "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vqvae_legacy" +, "resnet50_vqvae_legacy" +, "resnet101_vqvae_legacy" +, "resnet110_vqvae_legacy" +, "resnet152_vqvae_legacy" +, "resnet18_vae_legacy" +, "resnet50_vae_legacy" +] + +# set of parameters for a run, with default values +dflt_params = types.SimpleNamespace( + model_name='resnet18_vae' +, dataset=types.SimpleNamespace( + name='tiny_synthetic_shapes' + , path='/nfs/research/uhlmann/afoix/image_datasets/tiny_synthetic_shapes' + , type='mask' + ) +, batch_size=4 +, compression_factor=2 +, matrix_size=512 +, num_embeddings=1024 +, num_hiddens=1024 +, num_workers=16 +, epochs=150 +, pretrained=False +, frobenius_norm=False +, checkpoints_path='./checkpoints' +, commitment_cost=0.25 +, decay=0.99 +# optimizer_params +, opt="AdamW" +, lr=0.001 +, weight_decay=0.0001 +, momentum=0.9 +# lr_scheduler_params +, sched="cosine" +, min_lr=1e-4 +, warmup_epochs=5 +, warmup_lr=1e-6 +, cooldown_epochs=10 +, t_max=50 +, cycle_momentum=False +) + +# data +############################################################################### + +def maybe_roll(dist_mat, p = 0.5): + if np.random.rand() < p: + return np.roll(dist_mat, np.random.randint(0, dist_mat.shape[0]), (0,1)) + else: + return dist_mat + +def sanity_check(dist_mat): + if not np.allclose(dist_mat, dist_mat.T): + raise 
ValueError("Matrix is not symmetric") + if np.any(dist_mat < 0): + raise ValueError("Matrix has negative values") + if np.any(np.diag(dist_mat)): + raise ValueError("Matrix has non-zero diagonal") + return dist_mat + +def get_dataloader(params): + # transformations / checks to run on distance matrices + distmat_ts = transforms.Compose([ + lambda x: x / np.linalg.norm(x, "fro") # normalize the matrix + , lambda x: maybe_roll(x, p = 1.0) # "potentially" roll the matrix + , sanity_check # check if the matrix is symmetric and positive, and the diagonal is zero + , torch.as_tensor # turn (H,W) numpy array into a (H,W) tensor + , lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) + ]) + # dataset to load + logger.info(f'loading dataset {params.dataset.name}') + dataset = None + if params.dataset.type == 'raw_image': # TODO + raise NotImplementedError("raw images not yet supported") + elif params.dataset.type == 'mask': # mask data, convert to distance matrix first + dataset = datasets.ImageFolder( + params.dataset.path + , transforms.Compose([ np.array + , functools.partial( mask2distmatrix + , matrix_size=params.matrix_size ) + , distmat_ts ])) + elif params.dataset.type == 'distance_matrix': # distance matrix data + dataset = datasets.DatasetFolder( params.dataset.path + , loader=np.load + , extensions=('npy') + , transform = distmat_ts ) + assert dataset, f"could not load dataset {params.dataset.name}" + # create the dataloader from the dataset and other parameters + dataloader = bioimage_embed.lightning.DataModule( + dataset + , batch_size=params.batch_size + , shuffle=True + , num_workers=params.num_workers + ) + dataloader.setup() + logger.info(f'dataloader ready') + return dataloader + +# model +############################################################################### + +def get_model(params): + logger.info(f'setup model') + model = bioimage_embed.models.create_model( + model=params.model_name + , input_dim=params.input_dim + , latent_dim=params.latent_dim + , pretrained=params.pretrained + , **vars(params.model_args) + ) + lit_model = bioimage_embed.shapes.MaskEmbed(model, params) + logger.info(f'model ready') + return lit_model + +# trainer +############################################################################### + +def hashing_fn(args): + serialized_args = pickle.dumps(vars(args)) + hash_object = hashlib.sha256(serialized_args) + hashed_string = base64.urlsafe_b64encode(hash_object.digest()).decode() + return hashed_string + +def get_trainer(model, params): + + # setup WandB logger + logger.info('setup wandb logger') + jobname = f"{params.model_name}_{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}" + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger.watch(model, log="all") + + # setup checkpoints + logger.info('setup checkpoints') + model_dir = f"{params.checkpoints_path}/{hashing_fn(params)}" + os.makedirs(f"{model_dir}/", exist_ok=True) + checkpoint_callback = ModelCheckpoint( + dirpath=f"{model_dir}/" + , save_last=True + , save_top_k=1 + , monitor="loss/val" + , mode="min" + ) + + # setup trainer + logger.info('setup trainer') + trainer = pl.Trainer( + logger=[wandblogger] + , gradient_clip_val=0.5 + , enable_checkpointing=True + , devices=1 + , accelerator="gpu" + , accumulate_grad_batches=4 + , callbacks=[ checkpoint_callback + , 
EarlyStopping(monitor="loss/val", mode="min") + ] + , min_epochs=50 + , max_epochs=params.epochs + , log_every_n_steps=1 + ) + + logger.info(f'trainer ready') + return trainer + +# train / validate / test the model +############################################################################### + +def train_model(trainer, model, dataloader): + # retrieve the checkpoint information from the trainer and check if a + # checkpoint exists to resume from + checkpoint_callback = trainer.checkpoint_callback + last_checkpoint_path = checkpoint_callback.last_model_path + best_checkpoint_path = checkpoint_callback.best_model_path + if os.path.isfile(last_checkpoint_path): + resume_checkpoint = last_checkpoint_path + elif best_checkpoint_path and os.path.isfile(best_checkpoint_path): + resume_checkpoint = best_checkpoint_path + else: + resume_checkpoint = None + # train the model + logger.info('training the model') + trainer.fit(model, datamodule=dataloader, ckpt_path=resume_checkpoint) + model.eval() + + return model + +def validate_model(trainer, model, dataloader): + logger.info('validating the model') + validation = trainer.validate(model, datamodule=dataloader) + return validation + +def test_model(trainer, model, dataloader): + logger.info('testing the model') + testing = trainer.test(model, datamodule=dataloader) + return testing + +# main process +############################################################################### + +def main_process(params): + + # setup + model = get_model(params) + trainer = get_trainer(model, params) + dataloader = get_dataloader(params) + + # run actual work + train_model(trainer, model, dataloader) + validate_model(trainer, model, dataloader) + test_model(trainer, model, dataloader) + + # gather results + +# main entry point +############################################################################### +if __name__ == '__main__': + def auto_pos_int (x): + val = int(x,0) + if val <= 0: + raise argparse.ArgumentTypeError("argument must be a positive int. 
Got {:d}.".format(val)) + return val + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + parser.add_argument( + '-m', '--model', choices=models, metavar='MODEL' + , help=f"The MODEL to use, one of {models} (default {dflt_params.model_name}).") + parser.add_argument( + '--model-arg-beta', type=float, metavar='BETA' + , help=f"The BETA parameter to use for a beta-vae model.") + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE') + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_params.dataset})") + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=None + , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument( + '--wandb-entity', default="foix", metavar='WANDB_ENTITY' + , help=f"The WANDB_ENTITY name") + parser.add_argument( + '--wandb-project', default="simply-shape", metavar='WANDB_PROJECT' + , help=f"The WANDB_PROJECT name") + parser.add_argument( + '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int + , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '-c', '--compression-factor', metavar='COMPRESSION_FACTOR', type=auto_pos_int + , help=f"The COMPRESSION_FACTOR, a positive integer (default {dflt_params.compression_factor})") + parser.add_argument( + '--distance-matrix-size', metavar='MATRIX_SIZE', type=auto_pos_int + , help=f"The size of the distance matrix (default {dflt_params.matrix_size})") + parser.add_argument( + '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int + , help=f"The NUM_EMBEDDINGS, a positive integer (default {dflt_params.num_embeddings})") + parser.add_argument( + '--number-hiddens', metavar='NUM_HIDDENS', type=auto_pos_int + , help=f"The NUM_HIDDENS, a positive integer (default {dflt_params.num_hiddens})") + parser.add_argument( + '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int + , help=f"The NUM_WORKERS for the run, a positive integer (default {dflt_params.num_workers})") + parser.add_argument( + '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int + , help=f"The NUM_EPOCHS for the run, a positive integer (default {dflt_params.epochs})") + parser.add_argument('--clear-checkpoints', action='store_true' + , help='remove checkpoints') + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + if clargs.verbose > 0: + logging.basicConfig(level=logging.INFO) + + params = copy.deepcopy(dflt_params) + # update default params with clargs + if clargs.model: + params.model = clargs.model + params.model_args = types.SimpleNamespace() + if clargs.model_arg_beta: + params.model_args.beta = clargs.model_arg_beta + params.output_dir = clargs.output_dir + if clargs.dataset: + params.dataset = clargs.dataset + if clargs.wandb_entity: + params.wandb_entity = clargs.wandb_entity + if clargs.wandb_project: + params.wandb_project = clargs.wandb_project + if clargs.batch_size: + params.batch_size = clargs.batch_size + if clargs.distance_matrix_size: + params.matrix_size = clargs.distance_matrix_size + params.input_dim = (3, params.matrix_size, params.matrix_size) + if clargs.compression_factor: + params.compression_factor = clargs.compression_factor + n_features = lambda d, n: d*(d-1)/(2**n) + params.latent_dim = n_features(params.matrix_size, params.compression_factor) + if 
clargs.number_embeddings: + params.num_embeddings = clargs.number_embeddings + if clargs.number_hiddens: + params.num_hiddens = clargs.number_hiddens + if clargs.num_workers: + params.num_workers = clargs.num_workers + if clargs.num_epochs: + params.epochs = clargs.num_epochs + + main_process(params) From 751394e44251a1603268a1e95e25a59024312d40 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 17 Jun 2024 01:29:02 +0100 Subject: [PATCH 123/204] Added predictions + kmeans of input data --- scripts/shapeembed/shapeembed.py | 111 +++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 12 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index d4bf8a40..048f0bcf 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -1,5 +1,15 @@ #! /usr/bin/env python3 +# machine learning utils +import torch +from torchvision import datasets, transforms +import pytorch_lightning as pl +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint +from sklearn.cluster import KMeans +from sklearn.metrics import confusion_matrix, accuracy_score + # general utils import os import re @@ -7,18 +17,12 @@ import types import pickle import base64 +import pandas import hashlib import logging +import datetime import functools -# machine learning utils -import torch -from torchvision import datasets, transforms -import pytorch_lightning as pl -from pytorch_lightning import loggers as pl_loggers -from pytorch_lightning.callbacks.early_stopping import EarlyStopping -from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint - # own source files import bioimage_embed import bioimage_embed.shapes @@ -69,7 +73,7 @@ , matrix_size=512 , num_embeddings=1024 , num_hiddens=1024 -, num_workers=16 +, num_workers=8 , epochs=150 , pretrained=False , frobenius_norm=False @@ -244,22 +248,98 @@ def test_model(trainer, model, dataloader): testing = trainer.test(model, datamodule=dataloader) return testing +def run_predictions(trainer, model, dataloader, num_workers=8): + + # prepare new unshuffled datamodule + datamod = bioimage_embed.lightning.DataModule( + dataloader.dataset + , batch_size=1 + , shuffle=False + , num_workers=num_workers + ) + datamod.setup() + + # run predictions + predictions = trainer.predict(model, datamodule=datamod) + + # extract latent space + latent_space = torch.stack([d.out.z.flatten() for d in predictions]).numpy() + + # extract class indices and filenames and provide a richer pandas dataframe + ds = datamod.get_dataset() + class_indices = np.array([ int(lbl) + for _, lbl in datamod.predict_dataloader() ]) + fnames = [fname for fname, _ in ds.samples] + df = pandas.DataFrame(latent_space) + df['class_idx'] = class_indices + #df['class'] = [ds.classes[x] for x in class_indices] + df['class'] = pandas.Series([ ds.classes[x] + for x in class_indices]).astype("category") + df['fname'] = fnames + #df['scale'] = scalings[:,0].squeeze() + + return (predictions, latent_space, df) + +def dataloader_to_dataframe(dataloader): + # gather the data and the associated labels, and drop rows with NaNs + all_data = [] + all_lbls = [] + for batch in dataloader: + inputs, lbls = batch + for data, lbl in zip(inputs, lbls): + all_data.append(data.flatten().numpy()) + all_lbls.append(int(lbl)) + df = pandas.DataFrame(all_data) + df['label'] = all_lbls + df.dropna() + return df + +def 
run_kmeans(dataframe, random_seed=42): + # run KMeans and derive accuracy metric and confusion matrix + kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + , random_state=random_seed + ).fit(dataframe.drop('label', axis=1)) + accuracy = accuracy_score(dataframe['label'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) + + return kmeans, accuracy, conf_mat + # main process ############################################################################### def main_process(params): # setup + ####### model = get_model(params) trainer = get_trainer(model, params) dataloader = get_dataloader(params) # run actual work + ################# train_model(trainer, model, dataloader) validate_model(trainer, model, dataloader) test_model(trainer, model, dataloader) - # gather results + # run predictions + ################# + # ... and gather latent space + predictions, latent_space, df = run_predictions( + trainer, model, dataloader + , num_workers=params.num_workers + ) + # ... and prepare output directory and save latent space + os.makedirs(f"{params.output_dir}/", exist_ok=True) + np.save(f'{params.output_dir}/latent_space.npy', latent_space) + df.to_pickle(f'{params.output_dir}/latent_space.pkl') + + # gather metrics + ################ + # kmeans on input data + _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + logger.info(f'-- kmeans on input data --') + logger.info(f'-- accuracy: {accuracy}') + logger.info(f'-- confusion matrix:\n{conf_mat}') # main entry point ############################################################################### @@ -321,10 +401,10 @@ def auto_pos_int (x): # set verbosity level if clargs.verbose > 0: - logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.DEBUG) - params = copy.deepcopy(dflt_params) # update default params with clargs + params = copy.deepcopy(dflt_params) if clargs.model: params.model = clargs.model params.model_args = types.SimpleNamespace() @@ -354,5 +434,12 @@ def auto_pos_int (x): params.num_workers = clargs.num_workers if clargs.num_epochs: params.epochs = clargs.num_epochs + if clargs.output_dir: + params.output_dir = clargs.output_dir + else: + params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + # XXX + torch.set_float32_matmul_precision('medium') + # XXX main_process(params) From 949254d1bd13da5ec3675d3eaaa1cc58fd483fce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 17 Jun 2024 22:24:49 +0100 Subject: [PATCH 124/204] factored out evaluation functionality + added regionprops, efd and scoring --- scripts/shapeembed/__init__.py | 1 + scripts/shapeembed/evaluation.py | 130 +++++++++++++++++++++++++++++++ scripts/shapeembed/shapeembed.py | 47 +++++------ 3 files changed, 149 insertions(+), 29 deletions(-) create mode 100644 scripts/shapeembed/evaluation.py diff --git a/scripts/shapeembed/__init__.py b/scripts/shapeembed/__init__.py index e5853d2e..cd331ee4 100644 --- a/scripts/shapeembed/__init__.py +++ b/scripts/shapeembed/__init__.py @@ -1 +1,2 @@ from .dataset_transformations import mask2distmatrix +from .evaluation import * diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py new file mode 100644 index 00000000..f6575b96 --- /dev/null +++ b/scripts/shapeembed/evaluation.py @@ -0,0 +1,130 @@ +from torchvision import datasets, transforms +import pyefd +from skimage import measure +from sklearn.cluster 
import KMeans +from sklearn.pipeline import Pipeline +from sklearn.ensemble import RandomForestClassifier +from sklearn.discriminant_analysis import StandardScaler +from sklearn import metrics +from sklearn.metrics import make_scorer +from sklearn.metrics import confusion_matrix, accuracy_score +from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold + +import tqdm +import numpy +import pandas +import logging + +from bioimage_embed.shapes.transforms import ImageToCoords + +# logging facilities +############################################################################### +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def dataloader_to_dataframe(dataloader): + # gather the data and the associated labels, and drop rows with NaNs + all_data = [] + all_lbls = [] + for batch in dataloader: + inputs, lbls = batch + for data, lbl in zip(inputs, lbls): + all_data.append(data.flatten().numpy()) + all_lbls.append(int(lbl)) + df = pandas.DataFrame(all_data) + df['label'] = all_lbls + df.dropna() + return df + +def run_kmeans(dataframe, random_seed=42): + # run KMeans and derive accuracy metric and confusion matrix + kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + , random_state=random_seed + ).fit(dataframe.drop('label', axis=1)) + accuracy = accuracy_score(dataframe['label'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) + + return kmeans, accuracy, conf_mat + +def run_regionprops( dataset_params + , properties = [ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] ): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + # ... and run regionprops for the given properties for each image + dfs = [] + logger.info(f'running regionprops on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + t = measure.regionprops_table(numpy.array(img), properties=properties) + df = pandas.DataFrame(t) + df['class'] = lbl + df.set_index("class", inplace=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs) + return df + +def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + # ... 
and run efd on each image + dfs = [] + logger.info(f'running efd on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pandas.DataFrame({ + "norm_coeffs": norm_coeffs.flatten().tolist() + , "coeffs": coeffs.flatten().tolist() + }).T.rename_axis("coeffs") + df['class'] = lbl + df.set_index("class", inplace=True, append=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + return pandas.concat(dfs).xs('coeffs', level='coeffs') + +def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): + # TODO, currently unused + # Split the data into training and test sets + #X_train, X_test, y_train, y_test = train_test_split( + # df, df.index, stratify=df.index + #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle + #) + # Define a dictionary of metrics + scoring = { + "accuracy": make_scorer(metrics.balanced_accuracy_score) + , "precision": make_scorer(metrics.precision_score, average="macro") + , "recall": make_scorer(metrics.recall_score, average="macro") + , "f1": make_scorer(metrics.f1_score, average="macro") + #, "roc_auc": make_scorer(metrics.roc_auc_score, average="macro") + } + # Create a random forest classifier + pipeline = Pipeline([ + ("scaler", StandardScaler()) + #, ("pca", PCA(n_components=0.95, whiten=True, random_state=rand_seed)) + , ("clf", RandomForestClassifier()) + #, ("clf", DummyClassifier()) + ]) + # Perform k-fold cross-validation + cv_results = cross_validate( + estimator=pipeline + , X=df + , y=df.index + , cv=StratifiedKFold(n_splits=k_folds) + , scoring=scoring + , n_jobs=-1 + , return_train_score=False + ) + # Put the results into a DataFrame + return pandas.DataFrame(cv_results) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 048f0bcf..420af2ee 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -7,8 +7,6 @@ from pytorch_lightning import loggers as pl_loggers from pytorch_lightning.callbacks.early_stopping import EarlyStopping from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint -from sklearn.cluster import KMeans -from sklearn.metrics import confusion_matrix, accuracy_score # general utils import os @@ -27,6 +25,7 @@ import bioimage_embed import bioimage_embed.shapes from dataset_transformations import * +from evaluation import * # logging facilities ############################################################################### @@ -280,30 +279,6 @@ def run_predictions(trainer, model, dataloader, num_workers=8): return (predictions, latent_space, df) -def dataloader_to_dataframe(dataloader): - # gather the data and the associated labels, and drop rows with NaNs - all_data = [] - all_lbls = [] - for batch in dataloader: - inputs, lbls = batch - for data, lbl in zip(inputs, lbls): - all_data.append(data.flatten().numpy()) - all_lbls.append(int(lbl)) - df = pandas.DataFrame(all_data) - df['label'] = all_lbls - df.dropna() - return df - -def run_kmeans(dataframe, random_seed=42): - # run KMeans and derive accuracy metric and confusion matrix - kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) - , random_state=random_seed - ).fit(dataframe.drop('label', axis=1)) - accuracy = accuracy_score(dataframe['label'], kmeans.labels_) - conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) - - return kmeans, accuracy, 
conf_mat - # main process ############################################################################### @@ -335,11 +310,25 @@ def main_process(params): # gather metrics ################ + # regionprops on input data + logger.info(f'-- regionprops on input data --') + regionprops_df = run_regionprops(params.dataset) + logger.debug(regionprops_df) + regionprops_score_df = score_dataframe(regionprops_df) + logger.info(f'-- regionprops on input data, score:') + logger.info(regionprops_score_df) + # elliptic fourier descriptors on input data + logger.info(f'-- elliptic fourier descriptors on input data --') + efd_df = run_elliptic_fourier_descriptors(params.dataset) + logger.debug(efd_df) + efd_score_df = score_dataframe(efd_df) + logger.info(f'-- elliptic fourier descriptors on input data, score:') + logger.info(efd_score_df) # kmeans on input data - _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) logger.info(f'-- kmeans on input data --') - logger.info(f'-- accuracy: {accuracy}') - logger.info(f'-- confusion matrix:\n{conf_mat}') + _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # main entry point ############################################################################### From 43a396978ea15d2aa626086d6fbe020a988970d4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 19:24:08 +0100 Subject: [PATCH 125/204] cleaner logging + score shapeembed itself --- scripts/shapeembed/evaluation.py | 9 +++++--- scripts/shapeembed/shapeembed.py | 39 ++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index f6575b96..e00b3418 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -95,10 +95,13 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): return pandas.concat(dfs).xs('coeffs', level='coeffs') def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): + # drop strings and python object columns + #clean_df = df.select_dtypes(exclude=['object']) + clean_df = df.select_dtypes(include=['number']) # TODO, currently unused # Split the data into training and test sets #X_train, X_test, y_train, y_test = train_test_split( - # df, df.index, stratify=df.index + # clean_df, clean_df.index, stratify=clean_df.index #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle #) # Define a dictionary of metrics @@ -119,8 +122,8 @@ def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline - , X=df - , y=df.index + , X=clean_df + , y=clean_df.index , cv=StratifiedKFold(n_splits=k_folds) , scoring=scoring , n_jobs=-1 diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 420af2ee..25341893 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -299,36 +299,39 @@ def main_process(params): # run predictions ################# # ... and gather latent space - predictions, latent_space, df = run_predictions( + predictions, latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) # ... 
and prepare output directory and save latent space os.makedirs(f"{params.output_dir}/", exist_ok=True) np.save(f'{params.output_dir}/latent_space.npy', latent_space) - df.to_pickle(f'{params.output_dir}/latent_space.pkl') + shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') # gather metrics ################ - # regionprops on input data + # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) - logger.debug(regionprops_df) + logger.debug(f'\n{regionprops_df}') regionprops_score_df = score_dataframe(regionprops_df) - logger.info(f'-- regionprops on input data, score:') - logger.info(regionprops_score_df) - # elliptic fourier descriptors on input data + logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') + # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) - logger.debug(efd_df) + logger.debug(f'\n{efd_df}') efd_score_df = score_dataframe(efd_df) - logger.info(f'-- elliptic fourier descriptors on input data, score:') - logger.info(efd_score_df) - # kmeans on input data + logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') + # kmeans on input data and score logger.info(f'-- kmeans on input data --') _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) logger.info(f'-- kmeans accuracy: {accuracy}') logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + # score shape embed + logger.info(f'-- score shape embed --') + logger.debug(f'\n{shapeembed_df}') + shapeembed_score_df = score_dataframe(shapeembed_df) + logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') # main entry point ############################################################################### @@ -384,14 +387,16 @@ def auto_pos_int (x): , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") - + # parse command line arguments clargs=parser.parse_args() - + # set verbosity level - if clargs.verbose > 0: - logging.basicConfig(level=logging.DEBUG) - + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + # update default params with clargs params = copy.deepcopy(dflt_params) if clargs.model: @@ -427,7 +432,7 @@ def auto_pos_int (x): params.output_dir = clargs.output_dir else: params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' - + # XXX torch.set_float32_matmul_precision('medium') # XXX From 9030e487e933cd87f31fa2a614b3fab563647635 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 19:34:09 +0100 Subject: [PATCH 126/204] reshaped shapeembed reported dataframe --- scripts/shapeembed/shapeembed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 25341893..4fe4338f 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -270,12 +270,12 @@ def run_predictions(trainer, model, dataloader, num_workers=8): for _, lbl in datamod.predict_dataloader() ]) fnames = [fname for fname, _ in ds.samples] df = pandas.DataFrame(latent_space) - df['class_idx'] = class_indices - #df['class'] = [ds.classes[x] for 
x in class_indices] - df['class'] = pandas.Series([ ds.classes[x] - for x in class_indices]).astype("category") - df['fname'] = fnames - #df['scale'] = scalings[:,0].squeeze() + df.insert(loc=0, column='fname', value=fnames) + #df.insert(loc=0, column='scale', value=scalings[:,0].squeeze()) + df.insert( loc=0, column='class_name' + , value=[ds.classes[x] for x in class_indices]) + df.insert(loc=0, column='class', value=class_indices) + df.set_index("class", inplace=True) return (predictions, latent_space, df) From 6a5c7d028b1469de8d876f2e4bc6f856a43d9b17 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 21:09:56 +0100 Subject: [PATCH 127/204] renamed label to class --- scripts/shapeembed/evaluation.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index e00b3418..641ff27c 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -32,18 +32,17 @@ def dataloader_to_dataframe(dataloader): all_data.append(data.flatten().numpy()) all_lbls.append(int(lbl)) df = pandas.DataFrame(all_data) - df['label'] = all_lbls + df['class'] = all_lbls df.dropna() return df def run_kmeans(dataframe, random_seed=42): # run KMeans and derive accuracy metric and confusion matrix - kmeans = KMeans( n_clusters=len(dataframe['label'].unique()) + kmeans = KMeans( n_clusters=len(dataframe['class'].unique()) , random_state=random_seed - ).fit(dataframe.drop('label', axis=1)) - accuracy = accuracy_score(dataframe['label'], kmeans.labels_) - conf_mat = confusion_matrix(dataframe['label'], kmeans.labels_) - + ).fit(dataframe.drop('class', axis=1)) + accuracy = accuracy_score(dataframe['class'], kmeans.labels_) + conf_mat = confusion_matrix(dataframe['class'], kmeans.labels_) return kmeans, accuracy, conf_mat def run_regionprops( dataset_params From d216cc7df541597c92854c70d2630174509b3543 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 21:11:01 +0100 Subject: [PATCH 128/204] updated scoring function + collate and save results --- scripts/shapeembed/evaluation.py | 38 ++++++++++++++++++++++++++++++-- scripts/shapeembed/shapeembed.py | 24 ++++++++++++-------- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 641ff27c..e8692255 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -14,6 +14,8 @@ import numpy import pandas import logging +import seaborn +import matplotlib.pyplot as plt from bioimage_embed.shapes.transforms import ImageToCoords @@ -93,7 +95,8 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): # concatenate results as a single dataframe and return it return pandas.concat(dfs).xs('coeffs', level='coeffs') -def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): +def score_dataframe( df, name + , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns #clean_df = df.select_dtypes(exclude=['object']) clean_df = df.select_dtypes(include=['number']) @@ -129,4 +132,35 @@ def score_dataframe(df, test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5): , return_train_score=False ) # Put the results into a DataFrame - return pandas.DataFrame(cv_results) + df = pandas.DataFrame(cv_results) + df = df.drop(["fit_time", "score_time"], axis=1) + df.insert(loc=0, column='trial', value=name) + return df + +def save_scores( scores_df + , outputdir='.' 
+ , width = 3.45 + , height = 3.45 / 1.618 ): + # save all raw scores as csv + scores_df.to_csv(f"{outputdir}/scores_df.csv") + # save score means as csv + scores_df.groupby("trial").mean().to_csv(f"{outputdir}/scores_df_mean.csv") + # save a barplot representation of scores + melted_df = scores_df.melt( id_vars="trial" + , var_name="Metric" + , value_name="Score" ) + seaborn.catplot( data=melted_df + , kind="bar" + , x="trial" + , hue="Metric" + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + plt.savefig(f"{outputdir}/scores_barplot.pdf") + plt.close() + # log info + logger.info(melted_df.set_index(["trial", "Metric"]) + .xs("test_f1", level="Metric", drop_level=False) + .groupby("trial") + .mean()) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 4fe4338f..aa491b58 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -94,7 +94,7 @@ , cycle_momentum=False ) -# data +# dataset loading functions ############################################################################### def maybe_roll(dist_mat, p = 0.5): @@ -310,28 +310,34 @@ def main_process(params): # gather metrics ################ + # kmeans on input data and score + logger.info(f'-- kmeans on input data --') + kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + print(kmeans) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) logger.debug(f'\n{regionprops_df}') - regionprops_score_df = score_dataframe(regionprops_df) + regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) logger.debug(f'\n{efd_df}') - efd_score_df = score_dataframe(efd_df) + efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - _, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # score shape embed logger.info(f'-- score shape embed --') logger.debug(f'\n{shapeembed_df}') - shapeembed_score_df = score_dataframe(shapeembed_df) + shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + # collate and save gathered results TODO KMeans + scores_df = pandas.concat([ regionprops_score_df + , efd_score_df + , shapeembed_score_df ]) + save_scores(scores_df, outputdir=params.output_dir) # main entry point ############################################################################### From 7e2bdbd5edf16432e791c56cf4f1419bbcf85ffd Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 22:07:40 +0100 Subject: [PATCH 129/204] Added clargs to control matrix normalization and roll --- scripts/shapeembed/shapeembed.py | 57 ++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py 
b/scripts/shapeembed/shapeembed.py index aa491b58..fbd5570e 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -69,13 +69,15 @@ ) , batch_size=4 , compression_factor=2 -, matrix_size=512 +, distance_matrix_size=512 , num_embeddings=1024 , num_hiddens=1024 , num_workers=8 , epochs=150 , pretrained=False , frobenius_norm=False +, distance_matrix_normalize=True +, distance_matrix_roll_probability=1.0 , checkpoints_path='./checkpoints' , commitment_cost=0.25 , decay=0.99 @@ -113,14 +115,23 @@ def sanity_check(dist_mat): return dist_mat def get_dataloader(params): + # transformations / checks to run on distance matrices - distmat_ts = transforms.Compose([ - lambda x: x / np.linalg.norm(x, "fro") # normalize the matrix - , lambda x: maybe_roll(x, p = 1.0) # "potentially" roll the matrix - , sanity_check # check if the matrix is symmetric and positive, and the diagonal is zero - , torch.as_tensor # turn (H,W) numpy array into a (H,W) tensor - , lambda x: x.repeat(3, 1, 1) # turn (H,W) tensor into a (3,H,W) tensor (to fit downstream model expectations) - ]) + ts = [] + if params.distance_matrix_normalize: # optionally normalize the matrix + ts.append(lambda x: x / np.linalg.norm(x, "fro")) + if params.distance_matrix_roll_probability > 0.0: # optionally try to roll the matrix + ts.append(lambda x: maybe_roll(x, p=params.distance_matrix_roll_probability)) + # always check if the matrix is symmetric, positive, and diagonal is zero + ts.append(sanity_check) + # turn (H,W) numpy array into a (H,W) tensor + ts.append(torch.as_tensor) + # turn (H,W) tensor into a (3,H,W) tensor (downstream model expectations) + ts.append(lambda x: x.repeat(3, 1, 1)) + # compose the all the distance matrix transformations + logger.debug(f'transformations to run: {len(ts)}') + distmat_ts = transforms.Compose(ts) + # dataset to load logger.info(f'loading dataset {params.dataset.name}') dataset = None @@ -131,7 +142,7 @@ def get_dataloader(params): params.dataset.path , transforms.Compose([ np.array , functools.partial( mask2distmatrix - , matrix_size=params.matrix_size ) + , matrix_size=params.distance_matrix_size ) , distmat_ts ])) elif params.dataset.type == 'distance_matrix': # distance matrix data dataset = datasets.DatasetFolder( params.dataset.path @@ -342,10 +353,17 @@ def main_process(params): # main entry point ############################################################################### if __name__ == '__main__': + def auto_pos_int (x): val = int(x,0) if val <= 0: - raise argparse.ArgumentTypeError("argument must be a positive int. Got {:d}.".format(val)) + raise argparse.ArgumentTypeError(f"argument must be a positive int. Got {val:d}.") + return val + + def prob (x): + val = float(x) + if val < 0.0 or val > 1.0: + raise argparse.ArgumentTypeError(f"argument must be between 0.0 and 1.0. 
Got {val:f}.") return val parser = argparse.ArgumentParser(description='Run the shape embed pipeline') @@ -371,12 +389,18 @@ def auto_pos_int (x): parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '--distance-matrix-normalize', action=argparse.BooleanOptionalAction, default=None + , help=f'Whether to normalize the distance matrices or not') + parser.add_argument( + '--distance-matrix-roll-probability', metavar='ROLL_PROB', type=prob, default=None + , help=f'Probability to roll the distance matrices along the diagonal (default {dflt_params.distance_matrix_roll_probability})') parser.add_argument( '-c', '--compression-factor', metavar='COMPRESSION_FACTOR', type=auto_pos_int , help=f"The COMPRESSION_FACTOR, a positive integer (default {dflt_params.compression_factor})") parser.add_argument( '--distance-matrix-size', metavar='MATRIX_SIZE', type=auto_pos_int - , help=f"The size of the distance matrix (default {dflt_params.matrix_size})") + , help=f"The size of the distance matrix (default {dflt_params.distance_matrix_size})") parser.add_argument( '--number-embeddings', metavar='NUM_EMBEDDINGS', type=auto_pos_int , help=f"The NUM_EMBEDDINGS, a positive integer (default {dflt_params.num_embeddings})") @@ -420,12 +444,16 @@ def auto_pos_int (x): if clargs.batch_size: params.batch_size = clargs.batch_size if clargs.distance_matrix_size: - params.matrix_size = clargs.distance_matrix_size - params.input_dim = (3, params.matrix_size, params.matrix_size) + params.distance_matrix_size = clargs.distance_matrix_size + params.input_dim = (3, params.distance_matrix_size, params.distance_matrix_size) + if clargs.distance_matrix_normalize is not None: + params.distance_matrix_normalize = clargs.distance_matrix_normalize + if clargs.distance_matrix_roll_probability is not None: + params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor n_features = lambda d, n: d*(d-1)/(2**n) - params.latent_dim = n_features(params.matrix_size, params.compression_factor) + params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings if clargs.number_hiddens: @@ -442,4 +470,5 @@ def auto_pos_int (x): # XXX torch.set_float32_matmul_precision('medium') # XXX + logger.debug(f'run parameters:\n{params}') main_process(params) From 86ede7b00f76d84431a908f1881996a09831c893 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 18 Jun 2024 23:23:27 +0100 Subject: [PATCH 130/204] Added umap_plot --- scripts/shapeembed/evaluation.py | 49 +++++++++++++++++++++++++++++++- scripts/shapeembed/shapeembed.py | 25 +++++++++------- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index e8692255..9655fbd7 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -1,5 +1,6 @@ from torchvision import datasets, transforms import pyefd +from umap import UMAP from skimage import measure from sklearn.cluster import KMeans from sklearn.pipeline import Pipeline @@ -22,7 +23,7 @@ # logging facilities ############################################################################### logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) 
+logging.basicConfig(level=logging.DEBUG)
 def dataloader_to_dataframe(dataloader):
   # gather the data and the associated labels, and drop rows with NaNs
@@ -137,6 +138,52 @@ def score_dataframe( df, name
   df.insert(loc=0, column='trial', value=name)
   return df
+def umap_plot( df
+             , name
+             , outputdir='.'
+             , n_neighbors=15
+             , min_dist=0.1
+             , n_components=2
+             , rand_seed=42
+             , split=0.7
+             , width=3.45
+             , height=3.45 / 1.618 ):
+  clean_df = df.select_dtypes(include=['number'])
+  umap_reducer = UMAP( n_neighbors=n_neighbors
+                     , min_dist=min_dist
+                     , n_components=n_components
+                     , random_state=rand_seed )
+  mask = numpy.random.rand(len(clean_df)) < split
+
+  clean_df.reset_index(level='class', inplace=True)
+  classes = clean_df['class'].copy()
+  semi_labels = classes.copy()
+  semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision
+  clean_df.drop('class', axis=1, inplace=True)
+
+  umap_embedding = umap_reducer.fit_transform(clean_df, y=semi_labels)
+  umap_data=pandas.DataFrame(umap_embedding, columns=["umap0", "umap1"])
+  umap_data['class'] = classes
+
+  ax = seaborn.relplot( data=umap_data
+                      , x="umap0"
+                      , y="umap1"
+                      , hue="class"
+                      , palette="deep"
+                      , alpha=0.5
+                      , edgecolor=None
+                      , s=5
+                      , height=height
+                      , aspect=0.5 * width / height )
+
+  seaborn.move_legend(ax, "upper center")
+  ax.set(xlabel=None, ylabel=None)
+  seaborn.despine(left=True, bottom=True)
+  plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False)
+  plt.tight_layout()
+  plt.savefig(f"{outputdir}/umap_{name}.pdf")
+  plt.close()
+
 def save_scores( scores_df
                , outputdir='.'
                , width = 3.45
diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py
index fbd5570e..384871f9 100755
--- a/scripts/shapeembed/shapeembed.py
+++ b/scripts/shapeembed/shapeembed.py
@@ -310,23 +310,25 @@ def main_process(params):
   # run predictions
   #################
   # ... and gather latent space
+  logger.info(f'-- run predictions and extract latent space --')
   predictions, latent_space, shapeembed_df = run_predictions(
     trainer, model, dataloader
   , num_workers=params.num_workers
   )
+  logger.debug(f'\n{shapeembed_df}')
   # ...
and prepare output directory and save latent space os.makedirs(f"{params.output_dir}/", exist_ok=True) np.save(f'{params.output_dir}/latent_space.npy', latent_space) shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') + logger.info(f'-- generate shapeembed umap --') + umap_plot(shapeembed_df, 'shapeembed', outputdir=params.output_dir) # gather metrics ################ - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - print(kmeans) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + # score shape embed + logger.info(f'-- score shape embed --') + shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) @@ -339,11 +341,12 @@ def main_process(params): logger.debug(f'\n{efd_df}') efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') - # score shape embed - logger.info(f'-- score shape embed --') - logger.debug(f'\n{shapeembed_df}') - shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + # kmeans on input data and score + logger.info(f'-- kmeans on input data --') + kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + print(kmeans) + logger.info(f'-- kmeans accuracy: {accuracy}') + logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') # collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , efd_score_df From 19edf47216914ad70b9a1ea5140c5289e215cec1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 14:47:02 +0100 Subject: [PATCH 131/204] fix dataset clarg --- scripts/shapeembed/shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 384871f9..86b79fdd 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -439,7 +439,10 @@ def prob (x): params.model_args.beta = clargs.model_arg_beta params.output_dir = clargs.output_dir if clargs.dataset: - params.dataset = clargs.dataset + params.dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + if clargs.wandb_entity: params.wandb_entity = clargs.wandb_entity if clargs.wandb_project: From 976edc23cab552611eafb7c0ccd03c11898bb306 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 14:54:14 +0100 Subject: [PATCH 132/204] fix model name clarg --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 86b79fdd..f2d59573 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -432,8 +432,8 @@ def prob (x): # update default params with clargs params = copy.deepcopy(dflt_params) - if clargs.model: - params.model = clargs.model + if clargs.model_name: + params.model_name = clargs.model_name params.model_args = types.SimpleNamespace() if clargs.model_arg_beta: params.model_args.beta = clargs.model_arg_beta From 
d1c5d3c713bdbf1ccb497111c1858fa77a0f8498 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 24 Jun 2024 15:01:47 +0100 Subject: [PATCH 133/204] fix model_name clarg again --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index f2d59573..7d3f46e5 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -432,8 +432,8 @@ def prob (x): # update default params with clargs params = copy.deepcopy(dflt_params) - if clargs.model_name: - params.model_name = clargs.model_name + if clargs.model: + params.model_name = clargs.model params.model_args = types.SimpleNamespace() if clargs.model_arg_beta: params.model_args.beta = clargs.model_arg_beta From 485124112fb333f05172e0aac584c8982eaf1728 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 25 Jun 2024 07:29:11 +0100 Subject: [PATCH 134/204] Added early stop clarg (default no early stop) --- scripts/shapeembed/shapeembed.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 7d3f46e5..aa0b75b4 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -76,6 +76,7 @@ , epochs=150 , pretrained=False , frobenius_norm=False +, early_stop=False , distance_matrix_normalize=True , distance_matrix_roll_probability=1.0 , checkpoints_path='./checkpoints' @@ -208,6 +209,9 @@ def get_trainer(model, params): # setup trainer logger.info('setup trainer') + trainer_callbacks = [checkpoint_callback] + if params.early_stop: + trainer_callbacks.append(EarlyStopping(monitor="loss/val", mode="min")) trainer = pl.Trainer( logger=[wandblogger] , gradient_clip_val=0.5 @@ -215,9 +219,7 @@ def get_trainer(model, params): , devices=1 , accelerator="gpu" , accumulate_grad_batches=4 - , callbacks=[ checkpoint_callback - , EarlyStopping(monitor="loss/val", mode="min") - ] + , callbacks=trainer_callbacks , min_epochs=50 , max_epochs=params.epochs , log_every_n_steps=1 @@ -392,6 +394,9 @@ def prob (x): parser.add_argument( '-b', '--batch-size', metavar='BATCH_SIZE', type=auto_pos_int , help=f"The BATCH_SIZE for the run, a positive integer (default {dflt_params.batch_size})") + parser.add_argument( + '--early-stop', action=argparse.BooleanOptionalAction, default=None + , help=f'Whether to stop training early or not (when loss "stops" decreasing. 
Beware of second decay...)') parser.add_argument( '--distance-matrix-normalize', action=argparse.BooleanOptionalAction, default=None , help=f'Whether to normalize the distance matrices or not') @@ -452,6 +457,8 @@ def prob (x): if clargs.distance_matrix_size: params.distance_matrix_size = clargs.distance_matrix_size params.input_dim = (3, params.distance_matrix_size, params.distance_matrix_size) + if clargs.early_stop is not None: + params.early_stop = clargs.early_stop if clargs.distance_matrix_normalize is not None: params.distance_matrix_normalize = clargs.distance_matrix_normalize if clargs.distance_matrix_roll_probability is not None: From b95272e958ac627c7c0bf2bfdf34a8e837cac3c9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 20:04:15 +0100 Subject: [PATCH 135/204] added confusion matrices to scoring function --- scripts/shapeembed/evaluation.py | 7 +++++-- scripts/shapeembed/shapeembed.py | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 9655fbd7..84d8c8b3 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -9,7 +9,7 @@ from sklearn import metrics from sklearn.metrics import make_scorer from sklearn.metrics import confusion_matrix, accuracy_score -from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedKFold +from sklearn.model_selection import cross_validate, cross_val_predict, KFold, train_test_split, StratifiedKFold import tqdm import numpy @@ -122,6 +122,9 @@ def score_dataframe( df, name , ("clf", RandomForestClassifier()) #, ("clf", DummyClassifier()) ]) + # build confusion matrix + lbl_pred = cross_val_predict(pipeline, clean_df, clean_df.index) + conf_mat = confusion_matrix(clean_df.index, lbl_pred) # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline @@ -136,7 +139,7 @@ def score_dataframe( df, name df = pandas.DataFrame(cv_results) df = df.drop(["fit_time", "score_time"], axis=1) df.insert(loc=0, column='trial', value=name) - return df + return conf_mat, df def umap_plot( df , name diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index aa0b75b4..c0ba2f68 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -329,20 +329,26 @@ def main_process(params): ################ # score shape embed logger.info(f'-- score shape embed --') - shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data, score:\n{shapeembed_score_df}') + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + logger.info(f'-- shapeembed on input data') + logger.info(f'-- score:\n{shapeembed_score_df}') + logger.info(f'-- confusion matrix:\n{shapeembed_cm}') # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) logger.debug(f'\n{regionprops_df}') - regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') - logger.info(f'-- regionprops on input data, score:\n{regionprops_score_df}') + regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') + logger.info(f'-- regionprops on input data') + logger.info(f'-- score:\n{regionprops_score_df}') + logger.info(f'-- confusion matrix:\n{regionprops_cm}') # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = 
run_elliptic_fourier_descriptors(params.dataset) logger.debug(f'\n{efd_df}') - efd_score_df = score_dataframe(efd_df, 'efd') - logger.info(f'-- elliptic fourier descriptors on input data, score:\n{efd_score_df}') + efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') + logger.info(f'-- elliptic fourier descriptors on input data') + logger.info(f'-- score:\n{efd_score_df}') + logger.info(f'-- confusion matrix:\n{efd_cm}') # kmeans on input data and score logger.info(f'-- kmeans on input data --') kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) From 222f69892f11af2afb513d282eaa79d3c2c4b8eb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:18:01 +0100 Subject: [PATCH 136/204] use integer division for compression factor clarg --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index c0ba2f68..a47927c4 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -471,7 +471,7 @@ def prob (x): params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor - n_features = lambda d, n: d*(d-1)/(2**n) + n_features = lambda d, n: d*(d-1)//(2**n) params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings From 793b72079eabd9f30b5f553fdc849fcd89f9889e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:19:49 +0100 Subject: [PATCH 137/204] explicitly binarise image when running regionprops --- scripts/shapeembed/evaluation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 84d8c8b3..2cfa3d0c 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -56,15 +56,18 @@ def run_regionprops( dataset_params , "minor_axis_length" , "orientation" ] ): # access the dataset - assert dataset_params.type == 'mask' + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) # ... 
and run regionprops for the given properties for each image dfs = [] logger.info(f'running regionprops on {dataset_params.name}') logger.info(f'({dataset_params.path})') for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - t = measure.regionprops_table(numpy.array(img), properties=properties) + data = numpy.where(numpy.array(img)>20, 255, 0) + t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) + assert df.shape[0] == 1, f'More than one object in image #{i}' + df.index = [i] df['class'] = lbl df.set_index("class", inplace=True) dfs.append(df) From 43673ee07805fb7240f302cb882681b282659f6f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:22:39 +0100 Subject: [PATCH 138/204] keep 'class' as a column rather than index + keeps column names as strings --- scripts/shapeembed/evaluation.py | 22 ++++++++++++++-------- scripts/shapeembed/shapeembed.py | 3 ++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 2cfa3d0c..1bb582d4 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -69,7 +69,7 @@ def run_regionprops( dataset_params assert df.shape[0] == 1, f'More than one object in image #{i}' df.index = [i] df['class'] = lbl - df.set_index("class", inplace=True) + #df.set_index("class", inplace=True) dfs.append(df) # concatenate results as a single dataframe and return it df = pandas.concat(dfs) @@ -97,7 +97,9 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): df.set_index("class", inplace=True, append=True) dfs.append(df) # concatenate results as a single dataframe and return it - return pandas.concat(dfs).xs('coeffs', level='coeffs') + df = pandas.concat(dfs).xs('coeffs', level='coeffs') + df.reset_index(level='class', inplace=True) + return df def score_dataframe( df, name , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): @@ -107,7 +109,8 @@ def score_dataframe( df, name # TODO, currently unused # Split the data into training and test sets #X_train, X_test, y_train, y_test = train_test_split( - # clean_df, clean_df.index, stratify=clean_df.index + # clean_df.drop('class', axis=1), clean_df['class'] + #, stratify=clean_df['class'] #, test_size=test_sz, randm_state=rand_seed, shuffle=shuffle #) # Define a dictionary of metrics @@ -126,13 +129,16 @@ def score_dataframe( df, name #, ("clf", DummyClassifier()) ]) # build confusion matrix - lbl_pred = cross_val_predict(pipeline, clean_df, clean_df.index) - conf_mat = confusion_matrix(clean_df.index, lbl_pred) + clean_df.columns = clean_df.columns.astype(str) # only string column names + lbl_pred = cross_val_predict( pipeline + , clean_df.drop('class', axis=1) + , clean_df['class']) + conf_mat = confusion_matrix(clean_df['class'], lbl_pred) # Perform k-fold cross-validation cv_results = cross_validate( estimator=pipeline - , X=clean_df - , y=clean_df.index + , X=clean_df.drop('class', axis=1) + , y=clean_df['class'] , cv=StratifiedKFold(n_splits=k_folds) , scoring=scoring , n_jobs=-1 @@ -161,7 +167,7 @@ def umap_plot( df , random_state=rand_seed ) mask = numpy.random.rand(len(clean_df)) < split - clean_df.reset_index(level='class', inplace=True) + #clean_df.reset_index(level='class', inplace=True) classes = clean_df['class'].copy() semi_labels = classes.copy() semi_labels[~mask] = -1 # Assuming -1 indicates unknown label for semi-supervision diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index a47927c4..29189140 100755 --- 
a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -288,7 +288,8 @@ def run_predictions(trainer, model, dataloader, num_workers=8): df.insert( loc=0, column='class_name' , value=[ds.classes[x] for x in class_indices]) df.insert(loc=0, column='class', value=class_indices) - df.set_index("class", inplace=True) + #df.set_index("class", inplace=True) + df.columns = df.columns.astype(str) # only string column names return (predictions, latent_space, df) From 11a6e69e1a51df57d7768f25aea49bd96299c85a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:23:34 +0100 Subject: [PATCH 139/204] change len for shape[0] --- scripts/shapeembed/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 1bb582d4..56c24b05 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -165,7 +165,7 @@ def umap_plot( df , min_dist=min_dist , n_components=n_components , random_state=rand_seed ) - mask = numpy.random.rand(len(clean_df)) < split + mask = numpy.random.rand(clean_df.shape[0]) < split #clean_df.reset_index(level='class', inplace=True) classes = clean_df['class'].copy() From 91dd1c5dba9bd3f261f32b0bca18dc3ceee944fb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:24:06 +0100 Subject: [PATCH 140/204] drop not needed return value from run_predictions --- scripts/shapeembed/shapeembed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 29189140..c0be264e 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -291,7 +291,7 @@ def run_predictions(trainer, model, dataloader, num_workers=8): #df.set_index("class", inplace=True) df.columns = df.columns.astype(str) # only string column names - return (predictions, latent_space, df) + return latent_space, df # main process ############################################################################### @@ -314,7 +314,7 @@ def main_process(params): ################# # ... 
and gather latent space logger.info(f'-- run predictions and extract latent space --') - predictions, latent_space, shapeembed_df = run_predictions( + latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) From 963d59048d6eb6a542f696bd78de583b593c8fe0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Wed, 26 Jun 2024 23:24:52 +0100 Subject: [PATCH 141/204] added combined shapeembed + efd + regionprops scoring and comment out kmeans --- scripts/shapeembed/shapeembed.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index c0be264e..b4a9dfbb 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -350,13 +350,24 @@ def main_process(params): logger.info(f'-- elliptic fourier descriptors on input data') logger.info(f'-- score:\n{efd_score_df}') logger.info(f'-- confusion matrix:\n{efd_cm}') - # kmeans on input data and score - logger.info(f'-- kmeans on input data --') - kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - print(kmeans) - logger.info(f'-- kmeans accuracy: {accuracy}') - logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - # collate and save gathered results TODO KMeans + # combined shapeembed + efd + regionprops + logger.info(f'-- shapeembed + efd + regionprops --') + comb_df = pandas.concat([ shapeembed_df + , efd_df.drop('class', axis=1) + , regionprops_df.drop('class', axis=1) ], axis=1) + logger.debug(f'\n{comb_df}') + comb_cm, comb_score_df = score_dataframe(comb_df, 'combined') + logger.info(f'-- shapeembed + efd + regionprops on input data') + logger.info(f'-- score:\n{comb_score_df}') + logger.info(f'-- confusion matrix:\n{comb_cm}') + # XXX Not currently doing the kmeans + # XXX kmeans on input data and score + #logger.info(f'-- kmeans on input data --') + #kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + #print(kmeans) + #logger.info(f'-- kmeans accuracy: {accuracy}') + #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + ## collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , efd_score_df , shapeembed_score_df ]) From df09ad5f957d3ef55526f9be36aa1b346ef0aca9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 27 Jun 2024 17:35:06 +0100 Subject: [PATCH 142/204] save combined score --- scripts/shapeembed/shapeembed.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index b4a9dfbb..dabbf190 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -356,7 +356,7 @@ def main_process(params): , efd_df.drop('class', axis=1) , regionprops_df.drop('class', axis=1) ], axis=1) logger.debug(f'\n{comb_df}') - comb_cm, comb_score_df = score_dataframe(comb_df, 'combined') + comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') logger.info(f'-- shapeembed + efd + regionprops on input data') logger.info(f'-- score:\n{comb_score_df}') logger.info(f'-- confusion matrix:\n{comb_cm}') @@ -367,10 +367,12 @@ def main_process(params): #print(kmeans) #logger.info(f'-- kmeans accuracy: {accuracy}') #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - ## collate and save gathered results TODO KMeans + + # collate and save gathered results TODO KMeans scores_df = pandas.concat([ regionprops_score_df , 
efd_score_df - , shapeembed_score_df ]) + , shapeembed_score_df + , comb_score_df ]) save_scores(scores_df, outputdir=params.output_dir) # main entry point From ff6fbd483b90c113461546164585aedcd729b1c1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 27 Jun 2024 17:47:22 +0100 Subject: [PATCH 143/204] save confusion matrices --- scripts/shapeembed/evaluation.py | 12 ++++++++++++ scripts/shapeembed/shapeembed.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 56c24b05..b5d55cec 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -150,6 +150,18 @@ def score_dataframe( df, name df.insert(loc=0, column='trial', value=name) return conf_mat, df +def confusion_matrix_plot( cm, name, outputdir + , figsize=(10,7) ): + # Plot confusion matrix + plt.clf() # Clear figure + plt.figure(figsize=figsize) + seaborn.heatmap(cm, annot=True, fmt='d') + plt.title(f'{name} - Confusion Matrix') + plt.xlabel('Predicted') + plt.ylabel('Actual') + plt.savefig(f'{outputdir}/{name}_confusion_matrix.png') + plt.clf() # Clear figure + def umap_plot( df , name , outputdir='.' diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index dabbf190..1e05b222 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -334,6 +334,7 @@ def main_process(params): logger.info(f'-- shapeembed on input data') logger.info(f'-- score:\n{shapeembed_score_df}') logger.info(f'-- confusion matrix:\n{shapeembed_cm}') + confusion_matrix_plot(shapeembed_cm, 'shapeembed', params.output_dir) # regionprops on input data and score logger.info(f'-- regionprops on input data --') regionprops_df = run_regionprops(params.dataset) @@ -342,6 +343,7 @@ def main_process(params): logger.info(f'-- regionprops on input data') logger.info(f'-- score:\n{regionprops_score_df}') logger.info(f'-- confusion matrix:\n{regionprops_cm}') + confusion_matrix_plot(regionprops_cm, 'regionprops_cm', params.output_dir) # elliptic fourier descriptors on input data and score logger.info(f'-- elliptic fourier descriptors on input data --') efd_df = run_elliptic_fourier_descriptors(params.dataset) @@ -350,6 +352,7 @@ def main_process(params): logger.info(f'-- elliptic fourier descriptors on input data') logger.info(f'-- score:\n{efd_score_df}') logger.info(f'-- confusion matrix:\n{efd_cm}') + confusion_matrix_plot(efd_cm, 'efd', params.output_dir) # combined shapeembed + efd + regionprops logger.info(f'-- shapeembed + efd + regionprops --') comb_df = pandas.concat([ shapeembed_df @@ -360,6 +363,7 @@ def main_process(params): logger.info(f'-- shapeembed + efd + regionprops on input data') logger.info(f'-- score:\n{comb_score_df}') logger.info(f'-- confusion matrix:\n{comb_cm}') + confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) # XXX Not currently doing the kmeans # XXX kmeans on input data and score #logger.info(f'-- kmeans on input data --') From ebadaff76a84cd66d6775b55921c9a448148b494 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 5 Jul 2024 02:16:17 +0100 Subject: [PATCH 144/204] First attempt at a result gathering script --- scripts/shapeembed/gather_run_results.py | 138 +++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100755 scripts/shapeembed/gather_run_results.py diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py new file mode 100755 index 00000000..10d66fa6 --- /dev/null +++ 
b/scripts/shapeembed/gather_run_results.py @@ -0,0 +1,138 @@ +#! /usr/bin/env python3 + +import pandas as pd +import logging +import argparse +import shutil +import os +import functools + +# define a Custom aggregation +# function for finding total +def keep_first_fname(series): + return functools.reduce(lambda x, y: y if x == 'nofile' else y, series) + +def get_run_info(run): + x = run.split('_') + return f'{x[0]}_{x[1]}', x[2], x[4] + +def main_process(clargs, logger=logging.getLogger(__name__)): + print(clargs) + os.makedirs(clargs.output_dir, exist_ok=True) + dfs = [] + for d in clargs.run_folder: + csv = f'{d}/scores_df.csv' + #csv = f'{d}/scores_df_mean.csv' + if not os.path.isfile(csv): + print(f'WARNING: no {csv} found, skipping') + continue + + run_name = os.path.basename(d) + model, latent_space_sz, dataset = get_run_info(run_name) + df = pd.read_csv(csv) + df['model'] = model + df['latent_space_sz'] = latent_space_sz + df['dataset'] = dataset + for trial in ['efd','regionprops','shapeembed', 'combined_all']: + conf_mat = f'{trial}_confusion_matrix.png' + if os.path.isfile(f'{d}/{conf_mat}'): + shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') + df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' + else: + df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' + umap = f'umap_{trial}.pdf' + if os.path.isfile(f'{d}/{umap}'): + shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') + df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' + else: + df.loc[df['trial'] == trial, 'umap'] = f'nofile' + dfs.append(df.convert_dtypes()) + df = pd.concat(dfs) + df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df + df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) + df.sort_index(inplace=True) + df = df.groupby(level=['dataset', 'trial', 'model', 'latent_space_sz']).agg({ + 'test_accuracy': 'mean' + , 'test_precision': 'mean' + , 'test_recall': 'mean' + , 'test_f1': 'mean' + , 'conf_mat': keep_first_fname + , 'umap': keep_first_fname + }) + + print('-'*80) + print(df) + print('-'*80) + + + cell_hover = { # for row hover use instead of + 'selector': 'td:hover', + 'props': [('background-color', '#ffffb3')] + } + index_names = { + 'selector': '.index_name', + 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' + } + headers = { + 'selector': 'th:not(.index_name)', + 'props': 'background-color: #eeeeee; color: #333333;' + } + + def html_img(path): + if os.path.splitext(path)[1][1:] == 'png': + return f'' + if os.path.splitext(path)[1][1:] == 'pdf': + return f'' + return '
:(
' + df['conf_mat'] = df['conf_mat'].apply(html_img) + df['umap'] = df['umap'].apply(html_img) + + def render_html(fname, d): + with open(fname, 'w') as f: + f.write(''' + ''') + s = d.style + s.set_table_styles([cell_hover, index_names, headers]) + s.to_html(f, classes='df') + + with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: + f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + df.to_latex(f) + f.write('\\end{decument}') + render_html(f'{clargs.output_dir}/gathered_table.html', df) + + dft = df.transpose() + with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: + f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + dft.to_latex(f) + f.write('\\end{decument}') + render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Run the shape embed pipeline') + + parser.add_argument( 'run_folder', nargs="+", type=str + , help=f"The runs folders to gather results from") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR' + , default=f'{os.getcwd()}/gathered_results' + , help=f"The OUTPUT_DIR path to use to gather results") + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + main_process(clargs, logger) From 59fab42f7eeaed9144a5ec5d2011e4a40555c1b9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 5 Jul 2024 02:37:28 +0100 Subject: [PATCH 145/204] added barplots --- scripts/shapeembed/gather_run_results.py | 28 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 10d66fa6..1af719be 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -33,20 +33,32 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df['model'] = model df['latent_space_sz'] = latent_space_sz df['dataset'] = dataset + for trial in ['efd','regionprops','shapeembed', 'combined_all']: + conf_mat = f'{trial}_confusion_matrix.png' if os.path.isfile(f'{d}/{conf_mat}'): shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' else: df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' + umap = f'umap_{trial}.pdf' if os.path.isfile(f'{d}/{umap}'): shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' else: df.loc[df['trial'] == trial, 'umap'] = f'nofile' + + barplot = f'scores_barplot.pdf' + if os.path.isfile(f'{d}/{barplot}'): + shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') + df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' + else: + df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + dfs.append(df.convert_dtypes()) + df = pd.concat(dfs) df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) @@ -58,6 +70,7 @@ def 
main_process(clargs, logger=logging.getLogger(__name__)): , 'test_f1': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname + , 'barplot': keep_first_fname }) print('-'*80) @@ -80,24 +93,29 @@ def main_process(clargs, logger=logging.getLogger(__name__)): def html_img(path): if os.path.splitext(path)[1][1:] == 'png': - return f'' + return f'' if os.path.splitext(path)[1][1:] == 'pdf': - return f'' + return f'' return '
:(
' df['conf_mat'] = df['conf_mat'].apply(html_img) df['umap'] = df['umap'].apply(html_img) + df['barplot'] = df['barplot'].apply(html_img) def render_html(fname, d): with open(fname, 'w') as f: - f.write(''' + + ''') s = d.style s.set_table_styles([cell_hover, index_names, headers]) s.to_html(f, classes='df') + f.write('') with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') From bfab20d34107346376f4e52e3aca526c45c46b8d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:13:31 +0100 Subject: [PATCH 146/204] Added a separate regionprops script --- scripts/shapeembed/regionprops.py | 85 +++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100755 scripts/shapeembed/regionprops.py diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py new file mode 100755 index 00000000..b57fa517 --- /dev/null +++ b/scripts/shapeembed/regionprops.py @@ -0,0 +1,85 @@ +#! /usr/bin/env python3 + +import types +import logging +import argparse +from skimage import measure + +# own imports +from evaluation import * + +def run_regionprops( dataset_params + , properties + , logger ): + # access the dataset + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' + ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + # ... and run regionprops for the given properties for each image + dfs = [] + logger.info(f'running regionprops on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + data = numpy.where(numpy.array(img)>20, 255, 0) + t = measure.regionprops_table(data, properties=properties) + df = pandas.DataFrame(t) + assert df.shape[0] == 1, f'More than one object in image #{i}' + df.index = [i] + df['class'] = lbl + #df.set_index("class", inplace=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs) + return df + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run regionprops on a given dataset') + + dflt_dataset=('tiny_synthetic_shapes', '/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes', 'mask') + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE'), default=dflt_dataset + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_dataset})") + + dflt_properties=[ "area" + , "perimeter" + , "centroid" + , "major_axis_length" + , "minor_axis_length" + , "orientation" ] + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + # update default params with clargs + dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + properties = dflt_properties + + # regionprops on input data and score + + regionprops_df = run_regionprops(dataset, properties, logger) + + logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') + 
regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") + umap_plot(regionprops_df, f'{dataset.name}_regionprops_umap', outputdir=clargs.output_dir) + + regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') + + logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') + regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") + logger.info(f'-- confusion matrix:\n{regionprops_cm}') + confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops_cm', clargs.output_dir) From c4a3a23ae6b534fa97ff1670429b50153468e941 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:13:52 +0100 Subject: [PATCH 147/204] added a separate efd script --- scripts/shapeembed/efd.py | 83 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/shapeembed/efd.py diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py new file mode 100755 index 00000000..e2dee043 --- /dev/null +++ b/scripts/shapeembed/efd.py @@ -0,0 +1,83 @@ +#! /usr/bin/env python3 + +import types +import logging +import argparse +import pyefd + +# own imports +from evaluation import * + +def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): + # access the dataset + assert dataset_params.type == 'mask' + ds = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + # ... and run efd on each image + dfs = [] + logger.info(f'running efd on {dataset_params.name}') + logger.info(f'({dataset_params.path})') + for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) + norm_coeffs = pyefd.normalize_efd(coeffs) + df = pandas.DataFrame({ + "norm_coeffs": norm_coeffs.flatten().tolist() + , "coeffs": coeffs.flatten().tolist() + }).T.rename_axis("coeffs") + df['class'] = lbl + df.set_index("class", inplace=True, append=True) + dfs.append(df) + # concatenate results as a single dataframe and return it + df = pandas.concat(dfs).xs('coeffs', level='coeffs') + df.reset_index(level='class', inplace=True) + return df + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run efd on a given dataset') + + dflt_dataset=('tiny_synthetic_shapes', '/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes', 'mask') + parser.add_argument( + '-d', '--dataset', nargs=3, metavar=('NAME', 'PATH', 'TYPE'), default=dflt_dataset + , help=f"The NAME, PATH and TYPE of the dataset (default: {dflt_dataset})") + + dflt_contour_size=512 + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + # update default params with clargs + dataset = types.SimpleNamespace( name=clargs.dataset[0] + , path=clargs.dataset[1] + , type=clargs.dataset[2] ) + contour_size = dflt_contour_size + + # efd on input data and score + + efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) + + logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') 
+ efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") + umap_plot(efd_df, f'{dataset.name}_efd_umap', outputdir=clargs.output_dir) + + efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') + + logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') + efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") + logger.info(f'-- confusion matrix:\n{efd_cm}') + confusion_matrix_plot(efd_cm, f'{dataset.name}_efd_cm', clargs.output_dir) From b7998039ea4dfb75dd10d33164fb3c88491e8dbc Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:17:52 +0100 Subject: [PATCH 148/204] refactor efd and regionprops out of evaluation helpers --- scripts/shapeembed/efd.py | 4 +-- scripts/shapeembed/evaluation.py | 55 +------------------------------ scripts/shapeembed/regionprops.py | 4 +-- 3 files changed, 5 insertions(+), 58 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index e2dee043..d24db81d 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -73,11 +73,11 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") - umap_plot(efd_df, f'{dataset.name}_efd_umap', outputdir=clargs.output_dir) + umap_plot(efd_df, f'{dataset.name}_efd', outputdir=clargs.output_dir) efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, f'{dataset.name}_efd_cm', clargs.output_dir) + confusion_matrix_plot(efd_cm, f'{dataset.name}_efd', clargs.output_dir) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index b5d55cec..a273fb9d 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -48,59 +48,6 @@ def run_kmeans(dataframe, random_seed=42): conf_mat = confusion_matrix(dataframe['class'], kmeans.labels_) return kmeans, accuracy, conf_mat -def run_regionprops( dataset_params - , properties = [ "area" - , "perimeter" - , "centroid" - , "major_axis_length" - , "minor_axis_length" - , "orientation" ] ): - # access the dataset - assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) - # ... and run regionprops for the given properties for each image - dfs = [] - logger.info(f'running regionprops on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - data = numpy.where(numpy.array(img)>20, 255, 0) - t = measure.regionprops_table(data, properties=properties) - df = pandas.DataFrame(t) - assert df.shape[0] == 1, f'More than one object in image #{i}' - df.index = [i] - df['class'] = lbl - #df.set_index("class", inplace=True) - dfs.append(df) - # concatenate results as a single dataframe and return it - df = pandas.concat(dfs) - return df - -def run_elliptic_fourier_descriptors(dataset_params, contour_size=512): - # access the dataset - assert dataset_params.type == 'mask' - ds = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) - # ... 
and run efd on each image - dfs = [] - logger.info(f'running efd on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): - coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) - norm_coeffs = pyefd.normalize_efd(coeffs) - df = pandas.DataFrame({ - "norm_coeffs": norm_coeffs.flatten().tolist() - , "coeffs": coeffs.flatten().tolist() - }).T.rename_axis("coeffs") - df['class'] = lbl - df.set_index("class", inplace=True, append=True) - dfs.append(df) - # concatenate results as a single dataframe and return it - df = pandas.concat(dfs).xs('coeffs', level='coeffs') - df.reset_index(level='class', inplace=True) - return df - def score_dataframe( df, name , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns @@ -205,7 +152,7 @@ def umap_plot( df seaborn.despine(left=True, bottom=True) plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) plt.tight_layout() - plt.savefig(f"{outputdir}/umap_{name}.pdf") + plt.savefig(f"{outputdir}/{name}_umap.pdf") plt.close() def save_scores( scores_df diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index b57fa517..3af36220 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -75,11 +75,11 @@ def run_regionprops( dataset_params logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") - umap_plot(regionprops_df, f'{dataset.name}_regionprops_umap', outputdir=clargs.output_dir) + umap_plot(regionprops_df, f'{dataset.name}_regionprops', outputdir=clargs.output_dir) regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops_cm', clargs.output_dir) + confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops', clargs.output_dir) From 6bc1947f63de4ae74ba92c19f297a7b2596ffc02 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:23:47 +0100 Subject: [PATCH 149/204] less debug info by default + create outdir if not there --- scripts/shapeembed/efd.py | 6 +++++- scripts/shapeembed/evaluation.py | 2 +- scripts/shapeembed/regionprops.py | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index d24db81d..20f250c5 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -1,9 +1,10 @@ #! 
/usr/bin/env python3 +import os import types +import pyefd import logging import argparse -import pyefd # own imports from evaluation import * @@ -67,6 +68,9 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): , type=clargs.dataset[2] ) contour_size = dflt_contour_size + # create output dir if it does not exist + os.makedirs(clargs.output_dir, exist_ok=True) + # efd on input data and score efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index a273fb9d..0c2fec76 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -23,7 +23,7 @@ # logging facilities ############################################################################### logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.DEBUG) +#logging.basicConfig(level=logging.DEBUG) def dataloader_to_dataframe(dataloader): # gather the data and the associated labels, and drop rows with NaNs diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 3af36220..1d76309c 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -1,5 +1,6 @@ #! /usr/bin/env python3 +import os import types import logging import argparse @@ -69,6 +70,9 @@ def run_regionprops( dataset_params , type=clargs.dataset[2] ) properties = dflt_properties + # create output dir if it does not exist + os.makedirs(clargs.output_dir, exist_ok=True) + # regionprops on input data and score regionprops_df = run_regionprops(dataset, properties, logger) From db47da97e65b82f14a824d9ca2ef1f09972a7199 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 19:52:42 +0100 Subject: [PATCH 150/204] removed regionprops/efd from main shapeembed script + filename sanitisation --- scripts/shapeembed/shapeembed.py | 91 ++++++++++++-------------------- 1 file changed, 35 insertions(+), 56 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 1e05b222..53e963a8 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -64,7 +64,7 @@ model_name='resnet18_vae' , dataset=types.SimpleNamespace( name='tiny_synthetic_shapes' - , path='/nfs/research/uhlmann/afoix/image_datasets/tiny_synthetic_shapes' + , path='/nfs/research/uhlmann/afoix/datasets/image_datasets/tiny_synthetic_shapes' , type='mask' ) , batch_size=4 @@ -313,71 +313,50 @@ def main_process(params): # run predictions ################# # ... and gather latent space + os.makedirs(f"{params.output_dir}/", exist_ok=True) logger.info(f'-- run predictions and extract latent space --') latent_space, shapeembed_df = run_predictions( trainer, model, dataloader , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - # ... 
and prepare output directory and save latent space - os.makedirs(f"{params.output_dir}/", exist_ok=True) - np.save(f'{params.output_dir}/latent_space.npy', latent_space) - shapeembed_df.to_pickle(f'{params.output_dir}/latent_space.pkl') + np.save(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, 'shapeembed', outputdir=params.output_dir) - - # gather metrics - ################ - # score shape embed + umap_plot(shapeembed_df, f'{params.dataset.name}_shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') - logger.info(f'-- shapeembed on input data') - logger.info(f'-- score:\n{shapeembed_score_df}') + logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') + shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, 'shapeembed', params.output_dir) - # regionprops on input data and score - logger.info(f'-- regionprops on input data --') - regionprops_df = run_regionprops(params.dataset) - logger.debug(f'\n{regionprops_df}') - regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') - logger.info(f'-- regionprops on input data') - logger.info(f'-- score:\n{regionprops_score_df}') - logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, 'regionprops_cm', params.output_dir) - # elliptic fourier descriptors on input data and score - logger.info(f'-- elliptic fourier descriptors on input data --') - efd_df = run_elliptic_fourier_descriptors(params.dataset) - logger.debug(f'\n{efd_df}') - efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') - logger.info(f'-- elliptic fourier descriptors on input data') - logger.info(f'-- score:\n{efd_score_df}') - logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, 'efd', params.output_dir) - # combined shapeembed + efd + regionprops - logger.info(f'-- shapeembed + efd + regionprops --') - comb_df = pandas.concat([ shapeembed_df - , efd_df.drop('class', axis=1) - , regionprops_df.drop('class', axis=1) ], axis=1) - logger.debug(f'\n{comb_df}') - comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') - logger.info(f'-- shapeembed + efd + regionprops on input data') - logger.info(f'-- score:\n{comb_score_df}') - logger.info(f'-- confusion matrix:\n{comb_cm}') - confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) - # XXX Not currently doing the kmeans - # XXX kmeans on input data and score - #logger.info(f'-- kmeans on input data --') - #kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) - #print(kmeans) - #logger.info(f'-- kmeans accuracy: {accuracy}') - #logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') - - # collate and save gathered results TODO KMeans - scores_df = pandas.concat([ regionprops_score_df - , efd_score_df - , shapeembed_score_df - , comb_score_df ]) - save_scores(scores_df, outputdir=params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}_shapeembed', params.output_dir) + # XXX TODO move 
somewhere else if desired XXX + ## combined shapeembed + efd + regionprops + #logger.info(f'-- shapeembed + efd + regionprops --') + #comb_df = pandas.concat([ shapeembed_df + # , efd_df.drop('class', axis=1) + # , regionprops_df.drop('class', axis=1) ], axis=1) + #logger.debug(f'\n{comb_df}') + #comb_cm, comb_score_df = score_dataframe(comb_df, 'combined_all') + #logger.info(f'-- shapeembed + efd + regionprops on input data') + #logger.info(f'-- score:\n{comb_score_df}') + #logger.info(f'-- confusion matrix:\n{comb_cm}') + #confusion_matrix_plot(comb_cm, 'combined_all', params.output_dir) + ## XXX Not currently doing the kmeans + ## XXX kmeans on input data and score + ##logger.info(f'-- kmeans on input data --') + ##kmeans, accuracy, conf_mat = run_kmeans(dataloader_to_dataframe(dataloader.predict_dataloader())) + ##print(kmeans) + ##logger.info(f'-- kmeans accuracy: {accuracy}') + ##logger.info(f'-- kmeans confusion matrix:\n{conf_mat}') + + ## collate and save gathered results TODO KMeans + #scores_df = pandas.concat([ regionprops_score_df + # , efd_score_df + # , shapeembed_score_df + # , comb_score_df ]) + #save_scores(scores_df, outputdir=params.output_dir) # main entry point ############################################################################### From 5a8b27498194495e282f8cd6fa0150da0a582b9b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:01:54 +0100 Subject: [PATCH 151/204] unify file names across efd/regionprops/shapeembed --- scripts/shapeembed/efd.py | 8 ++++---- scripts/shapeembed/evaluation.py | 4 ++-- scripts/shapeembed/regionprops.py | 8 ++++---- scripts/shapeembed/shapeembed.py | 12 ++++++------ 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 20f250c5..4f910990 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -76,12 +76,12 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') - efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_df.csv") - umap_plot(efd_df, f'{dataset.name}_efd', outputdir=clargs.output_dir) + efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-raw_df.csv") + umap_plot(efd_df, f'{dataset.name}-efd', outputdir=clargs.output_dir) efd_cm, efd_score_df = score_dataframe(efd_df, 'efd') logger.info(f'-- efd on {dataset.name}, score\n{efd_score_df}') - efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_efd_score_df.csv") + efd_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-score_df.csv") logger.info(f'-- confusion matrix:\n{efd_cm}') - confusion_matrix_plot(efd_cm, f'{dataset.name}_efd', clargs.output_dir) + confusion_matrix_plot(efd_cm, f'{dataset.name}-efd', clargs.output_dir) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 0c2fec76..b5b2fbb8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -106,7 +106,7 @@ def confusion_matrix_plot( cm, name, outputdir plt.title(f'{name} - Confusion Matrix') plt.xlabel('Predicted') plt.ylabel('Actual') - plt.savefig(f'{outputdir}/{name}_confusion_matrix.png') + plt.savefig(f'{outputdir}/{name}-confusion_matrix.png') plt.clf() # Clear figure def umap_plot( df @@ -152,7 +152,7 @@ def umap_plot( df seaborn.despine(left=True, bottom=True) plt.tick_params(bottom=False, left=False, labelbottom=False, labelleft=False) plt.tight_layout() - 
plt.savefig(f"{outputdir}/{name}_umap.pdf") + plt.savefig(f"{outputdir}/{name}-umap.pdf") plt.close() def save_scores( scores_df diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 1d76309c..3b65933f 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -78,12 +78,12 @@ def run_regionprops( dataset_params regionprops_df = run_regionprops(dataset, properties, logger) logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') - regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_df.csv") - umap_plot(regionprops_df, f'{dataset.name}_regionprops', outputdir=clargs.output_dir) + regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-raw_df.csv") + umap_plot(regionprops_df, f'{dataset.name}-regionprops', outputdir=clargs.output_dir) regionprops_cm, regionprops_score_df = score_dataframe(regionprops_df, 'regionprops') logger.info(f'-- regionprops on {dataset.name}, score\n{regionprops_score_df}') - regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}_regionprops_score_df.csv") + regionprops_score_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-score_df.csv") logger.info(f'-- confusion matrix:\n{regionprops_cm}') - confusion_matrix_plot(regionprops_cm, f'{dataset.name}_regionprops', clargs.output_dir) + confusion_matrix_plot(regionprops_cm, f'{dataset.name}-regionprops', clargs.output_dir) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 53e963a8..7ae9130b 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -320,17 +320,17 @@ def main_process(params): , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - np.save(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.npy', latent_space) - shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}_shapeembed_latent_space.pkl') - shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_df.csv") + np.save(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-raw_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, f'{params.dataset.name}_shapeembed', outputdir=params.output_dir) + umap_plot(shapeembed_df, f'{params.dataset.name}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') - shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}_shapeembed_score_df.csv") + shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}_shapeembed', params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}-shapeembed', params.output_dir) # XXX TODO move somewhere else if desired XXX ## combined shapeembed + efd + regionprops #logger.info(f'-- shapeembed + efd + regionprops --') From 9d3a0538a16c2568fbf177b2aaf4661cc88a08e1 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:04:33 +0100 Subject: [PATCH 152/204] Added a readme --- scripts/shapeembed/readme.md 
| 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 scripts/shapeembed/readme.md diff --git a/scripts/shapeembed/readme.md b/scripts/shapeembed/readme.md new file mode 100644 index 00000000..76bebf92 --- /dev/null +++ b/scripts/shapeembed/readme.md @@ -0,0 +1,9 @@ +# Shape Embed + +There are currently 3 toplevel scripts: + +- shapeembed.py +- regionprops.py +- efd.py + +Each can be run to generate results, a umap and a confusion matrix. Each have a `-o` option to specify an output directory. From 8326afcd2b4b4a66fe730985443afc1ccfa56c0b Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 20:55:20 +0100 Subject: [PATCH 153/204] track params in reporting --- scripts/shapeembed/evaluation.py | 4 ++++ scripts/shapeembed/shapeembed.py | 38 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index b5b2fbb8..3f3452d8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -49,6 +49,7 @@ def run_kmeans(dataframe, random_seed=42): return kmeans, accuracy, conf_mat def score_dataframe( df, name + , tag_columns=[] , test_sz=0.2, rand_seed=42, shuffle=True, k_folds=5 ): # drop strings and python object columns #clean_df = df.select_dtypes(exclude=['object']) @@ -95,6 +96,9 @@ def score_dataframe( df, name df = pandas.DataFrame(cv_results) df = df.drop(["fit_time", "score_time"], axis=1) df.insert(loc=0, column='trial', value=name) + tag_columns.reverse() + for tag_col_name, tag_col_value in tag_columns: + df.insert(loc=0, column=tag_col_name, value=tag_col_value) return conf_mat, df def confusion_matrix_plot( cm, name, outputdir diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 7ae9130b..5e365dbd 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -97,6 +97,24 @@ , cycle_momentum=False ) +def model_str(params): + s = f'{params.model_name}' + if vars(params.model_args): + s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" + return s + +def job_str(params): + return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" + +def tag_cols(params): + cols = [] + cols.append(('dataset', params.dataset.name)) + cols.append(('model', model_str(params))) + cols.append(('compression_factor', params.compression_factor)) + cols.append(('latent_dim', params.latent_dim)) + cols.append(('batch_size', params.batch_size)) + return cols + # dataset loading functions ############################################################################### @@ -191,8 +209,7 @@ def get_trainer(model, params): # setup WandB logger logger.info('setup wandb logger') - jobname = f"{params.model_name}_{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}" - wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=jobname) + wandblogger = pl_loggers.WandbLogger(entity=params.wandb_entity, project=params.wandb_project, name=job_str(params)) wandblogger.watch(model, log="all") # setup checkpoints @@ -320,17 +337,18 @@ def main_process(params): , num_workers=params.num_workers ) logger.debug(f'\n{shapeembed_df}') - np.save(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.npy', latent_space) - 
shapeembed_df.to_pickle(f'{params.output_dir}/{params.dataset.name}-shapeembed-latent_space.pkl') - shapeembed_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-raw_df.csv") + pfx=job_str(params) + np.save(f'{params.output_dir}/{pfx}-shapeembed-latent_space.npy', latent_space) + shapeembed_df.to_pickle(f'{params.output_dir}/{pfx}-shapeembed-latent_space.pkl') + shapeembed_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-raw_df.csv") logger.info(f'-- generate shapeembed umap --') - umap_plot(shapeembed_df, f'{params.dataset.name}-shapeembed', outputdir=params.output_dir) + umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') - shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, f'shapeembed') + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)) logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') - shapeembed_score_df.to_csv(f"{params.output_dir}/{params.dataset.name}-shapeembed-score_df.csv") + shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') - confusion_matrix_plot(shapeembed_cm, f'{params.dataset.name}-shapeembed', params.output_dir) + confusion_matrix_plot(shapeembed_cm, f'{pfx}-shapeembed', params.output_dir) # XXX TODO move somewhere else if desired XXX ## combined shapeembed + efd + regionprops #logger.info(f'-- shapeembed + efd + regionprops --') @@ -481,7 +499,7 @@ def prob (x): if clargs.output_dir: params.output_dir = clargs.output_dir else: - params.output_dir = f'./{params.model_name}_{params.latent_dim}_{params.batch_size}_{params.dataset.name}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + params.output_dir = f'./{job_str(params)}_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' # XXX torch.set_float32_matmul_precision('medium') From aaa55dbc6ee5df7fefa4fbc41b47bcc416019088 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 21:10:22 +0100 Subject: [PATCH 154/204] also add model specific params as tag columns --- scripts/shapeembed/shapeembed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 5e365dbd..182a2415 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -110,6 +110,7 @@ def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) cols.append(('model', model_str(params))) + for k, v in vars(params.model_args).items(): cols.append((k, v)) cols.append(('compression_factor', params.compression_factor)) cols.append(('latent_dim', params.latent_dim)) cols.append(('batch_size', params.batch_size)) From ba67d36237d7614fcb1e968dc18c927d48be1234 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 22:19:33 +0100 Subject: [PATCH 155/204] added a slurm script to sweap shapeembed parameters --- scripts/shapeembed/slurm_sweap_shapeembed.py | 132 +++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100755 scripts/shapeembed/slurm_sweap_shapeembed.py diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py new file mode 100755 index 00000000..f8af3c85 --- /dev/null +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -0,0 +1,132 @@ +#! 
/usr/bin/env python3 + +import os +import logging +import argparse +import datetime +import subprocess + +# shapeembed parameters to sweap +################################################################################ + +datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' +datasets = [ +# ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") + ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +#, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") +, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +#, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") +#, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") +] + +models = [ + "resnet18_vae" +#, "resnet50_vae" +#, "resnet18_beta_vae" +#, "resnet18_vae_bolt" +#, "resnet50_vae_bolt" +, "resnet18_vqvae" +#, "resnet50_vqvae" +#, "resnet18_vqvae_legacy" +#, "resnet50_vqvae_legacy" +#, "resnet101_vqvae_legacy" +#, "resnet110_vqvae_legacy" +#, "resnet152_vqvae_legacy" +#, "resnet18_vae_legacy" +#, "resnet50_vae_legacy" +] + +model_params = { + "resnet18_beta_vae": {'beta': [0.5, 1.0, 2]} +} + +compression_factors = [2, 4] + +batch_sizes = [4] + +# other parameters +################################################################################ + +dflt_slurm_dir=f'{os.getcwd()}/slurm_info_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' +dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' + +slurm_time = '50:00:00' +slurm_mem = '200G' +slurm_gpus = 'a100:1' + +n_epochs = 2 + +wandb_project='shapeembed' + +slurm_script="""#! /bin/bash +echo "running shape embed with:" +echo " - dataset {dataset[0]} ({dataset[1]}, {dataset[2]})" +echo " - model {model} ({model_params})" +echo " - compression_factor {compression_factor}" +echo " - batch size {batch_size}" +python3 shapeembed.py --wandb-project {wandb_project} --num-epochs {n_epochs} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} +""" + +################################################################################ + +def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size): + jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' + logger.info(f'spawning {jobname}') + with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: + fp.write(slurm_script.format( dataset=dataset + , model=model + , model_params=[] + , compression_factor=compression_factor + , batch_size=batch_size + , out_dir=out_dir + , wandb_project=wandb_project + , n_epochs=n_epochs )) + fp.flush() + logger.info(f'written {fp.name}') + logger.debug(f'cat {fp.name}') + result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + logger.debug(result.stdout.decode('utf-8')) + result = subprocess.run([ 'sbatch' + , '--time', slurm_time + , '--mem', slurm_mem + , '--job-name', jobname + , '--output', f'{slurm_out_dir}/{jobname}.out' + , '--error', f'{slurm_out_dir}/{jobname}.err' + #, '--gres', n_gpus(ls) + , f'--gpus={slurm_gpus}' + , fp.name ], stdout=subprocess.PIPE) + logger.info(result.stdout.decode('utf-8')) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Sweap parameters for shapeembed') + + parser.add_argument( + '-s', '--slurm-output-dir', metavar='SLURM_OUTPUT_DIR', default=dflt_slurm_dir + , help=f"The SLURM_OUTPUT_DIR path 
to use to dump slurm info") + + parser.add_argument( + '-o', '--output-dir', metavar='OUTPUT_DIR', default=dflt_out_dir + , help=f"The OUTPUT_DIR path to use to dump results") + + parser.add_argument('-v', '--verbose', action='count', default=0 + , help="Increase verbosity level by adding more \"v\".") + + # parse command line arguments + clargs=parser.parse_args() + + # set verbosity level + logger = logging.getLogger(__name__) + if clargs.verbose > 2: + logger.setLevel(logging.DEBUG) + elif clargs.verbose > 0: + logger.setLevel(logging.INFO) + + os.makedirs(clargs.slurm_output_dir, exist_ok=True) + os.makedirs(clargs.output_dir, exist_ok=True) + + for params in [ (ds, m, cf, bs) for ds in datasets + for m in models + for cf in compression_factors + for bs in batch_sizes ]: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) From 7c422b1473a861690491d5da20e882051fd279d6 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:23:05 +0100 Subject: [PATCH 156/204] added resnet50_beta_vae to the factory --- bioimage_embed/models/factory.py | 40 +++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 713b98af..8c6440d5 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -97,19 +97,6 @@ def resnet18_vae(self): bolts.ResNet18VAEDecoder, ) - def resnet50_vae(self): - return self.create_model( - partial( - pythae.models.VAEConfig, - use_default_encoder=False, - use_default_decoder=False, - **self.kwargs - ), - pythae.models.VAE, - bolts.ResNet50VAEEncoder, - bolts.ResNet50VAEDecoder, - ) - def resnet18_vqvae(self): return self.create_model( partial( @@ -136,6 +123,19 @@ def resnet18_beta_vae(self): bolts.ResNet18VAEDecoder, ) + def resnet50_vae(self): + return self.create_model( + partial( + pythae.models.VAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.VAE, + bolts.ResNet50VAEEncoder, + bolts.ResNet50VAEDecoder, + ) + def resnet50_vqvae(self): return self.create_model( partial( @@ -149,6 +149,19 @@ def resnet50_vqvae(self): bolts.ResNet50VQVAEDecoder, ) + def resnet50_beta_vae(self): + return self.create_model( + partial( + pythae.models.BetaVAEConfig, + use_default_encoder=False, + use_default_decoder=False, + **self.kwargs + ), + pythae.models.BetaVAE, + bolts.ResNet50VAEEncoder, + bolts.ResNet50VAEDecoder, + ) + def resnet_vae_legacy(self, depth): return self.create_model( pythae.models.VAEConfig, @@ -192,6 +205,7 @@ def resnet152_vqvae_legacy(self): "resnet18_vae", "resnet18_beta_vae", "resnet50_vae", + "resnet50_beta_vae", "resnet18_vae_bolt", "resnet50_vae_bolt", "resnet18_vqvae", From 1307579b2239fc83f9ab2998f4023e19bffa1a94 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:23:56 +0100 Subject: [PATCH 157/204] added resnet50_beta_vae to the shapeembed script --- scripts/shapeembed/shapeembed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 182a2415..2b987875 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -46,6 +46,7 @@ "resnet18_vae" , "resnet50_vae" , "resnet18_beta_vae" +, "resnet50_beta_vae" , "resnet18_vae_bolt" , "resnet50_vae_bolt" , "resnet18_vqvae" From 1f82d9f200524998b93eadd2919cd67c54d419e4 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:24:47 +0100 Subject: [PATCH 158/204] handle per 
model params in slurm script + chose some param values to sweap --- scripts/shapeembed/slurm_sweap_shapeembed.py | 41 +++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index f8af3c85..b1ef3389 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -4,6 +4,7 @@ import logging import argparse import datetime +import itertools import subprocess # shapeembed parameters to sweap @@ -11,22 +12,24 @@ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' datasets = [ + ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") - ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") -, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +#, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") #, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") ] models = [ "resnet18_vae" -#, "resnet50_vae" -#, "resnet18_beta_vae" +, "resnet50_vae" +, "resnet18_beta_vae" +, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" , "resnet18_vqvae" -#, "resnet50_vqvae" +, "resnet50_vqvae" #, "resnet18_vqvae_legacy" #, "resnet50_vqvae_legacy" #, "resnet101_vqvae_legacy" @@ -37,12 +40,13 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [0.5, 1.0, 2]} + "resnet18_beta_vae": {'beta': [1,2,5,10,20]} +, "resnet50_beta_vae": {'beta': [1,2,5,10,20]} } -compression_factors = [2, 4] +compression_factors = [1,2,3,5,10,20] -batch_sizes = [4] +batch_sizes = [4, 8, 16] # other parameters ################################################################################ @@ -54,8 +58,6 @@ slurm_mem = '200G' slurm_gpus = 'a100:1' -n_epochs = 2 - wandb_project='shapeembed' slurm_script="""#! 
/bin/bash @@ -64,15 +66,19 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --num-epochs {n_epochs} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} +python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ -def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size): +def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: + extra_args=[] + for k, v in kwargs.items(): + extra_args.append(f'--model-arg-{k}') + extra_args.append(f'{v}') fp.write(slurm_script.format( dataset=dataset , model=model , model_params=[] @@ -80,7 +86,7 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ , batch_size=batch_size , out_dir=out_dir , wandb_project=wandb_project - , n_epochs=n_epochs )) + , extra_args=' '.join(extra_args) )) fp.flush() logger.info(f'written {fp.name}') logger.debug(f'cat {fp.name}') @@ -129,4 +135,11 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ for m in models for cf in compression_factors for bs in batch_sizes ]: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) + # per model params: + m = params[1] + if m in model_params: + mps = model_params[m] + for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params, **ps) + else: + spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) From 2c49fc9c1ad010a077037cd1b2dbaac5db509d22 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 18 Jul 2024 23:38:48 +0100 Subject: [PATCH 159/204] better slurm jobname --- scripts/shapeembed/slurm_sweap_shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index b1ef3389..456246dd 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -72,7 +72,10 @@ ################################################################################ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): - jobname = f'shapeembed_{dataset[0]}_{model}_{compression_factor}_{batch_size}' + model_str = model + if kwargs: + model_str += f"_{'_'.join([f'{k}{v}' for k, v in kwargs.items()])}" + jobname = f'shapeembed-{dataset[0]}-{model_str}-{compression_factor}-{batch_size}' logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: extra_args=[] From a291c037281154ff9d85a52cf17d64843c438442 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:58:00 +0100 Subject: [PATCH 160/204] removed compression factor 20 --- 
scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 456246dd..5568b9b3 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -44,7 +44,7 @@ , "resnet50_beta_vae": {'beta': [1,2,5,10,20]} } -compression_factors = [1,2,3,5,10,20] +compression_factors = [1,2,3,5,10] batch_sizes = [4, 8, 16] From a29b6bd652315117e5ca44f4944f27667db61c08 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:58:58 +0100 Subject: [PATCH 161/204] bumped up memory allocation to 250G --- scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 5568b9b3..ffed3933 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -55,7 +55,7 @@ dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' slurm_time = '50:00:00' -slurm_mem = '200G' +slurm_mem = '250G' slurm_gpus = 'a100:1' wandb_project='shapeembed' From dde96fc499fde1c124483e9439ff5ccedd5b3e03 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 19 Jul 2024 16:59:26 +0100 Subject: [PATCH 162/204] added --no-early-stop flag --- scripts/shapeembed/slurm_sweap_shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index ffed3933..50925bfe 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -66,7 +66,7 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} +python3 shapeembed.py --no-early-stop --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ From 8dbf551cef3de04c66871959843252d554cdbcce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 15:35:04 +0100 Subject: [PATCH 163/204] added an oom_retry function --- scripts/shapeembed/shapeembed.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 2b987875..6cedd481 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -117,6 +117,18 @@ def tag_cols(params): cols.append(('batch_size', params.batch_size)) return cols +def oom_retry(f, *args, n_oom_retries=1, logger=logging.getLogger(__name__), **kwargs): + try: + logger.info(f'Trying {f.__name__} within oom_retry, n_oom_retries = {n_oom_retries}') + return f(*args, **kwargs) + except RuntimeError as e: + if 'out of memory' in str(e) and n_oom_retries > 0: + logger.warning(f'{f.__name__} ran out of memory, retrying') + torch.cuda.empty_cache() + return oom_retry(f, *args, n_oom_retries=n_oom_retries-1, logger=logger, 
**kwargs) + else: + raise e + # dataset loading functions ############################################################################### @@ -319,15 +331,15 @@ def main_process(params): # setup ####### - model = get_model(params) - trainer = get_trainer(model, params) - dataloader = get_dataloader(params) + model = oom_retry(get_model, params) + trainer = oom_retry(get_trainer, model, params) + dataloader = oom_retry(get_dataloader, params) # run actual work ################# - train_model(trainer, model, dataloader) - validate_model(trainer, model, dataloader) - test_model(trainer, model, dataloader) + oom_retry(train_model, trainer, model, dataloader, n_oom_retries=2) + oom_retry(validate_model, trainer, model, dataloader) + oom_retry(test_model, trainer, model, dataloader) # run predictions ################# From 38a9ff6429c4a6f3a3a3940dc4cf7868253d4157 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 15:55:37 +0100 Subject: [PATCH 164/204] refined min / max epochs clargs --- scripts/shapeembed/shapeembed.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 6cedd481..9144e17a 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -74,7 +74,8 @@ , num_embeddings=1024 , num_hiddens=1024 , num_workers=8 -, epochs=150 +, min_epochs=50 +, max_epochs=150 , pretrained=False , frobenius_norm=False , early_stop=False @@ -251,8 +252,8 @@ def get_trainer(model, params): , accelerator="gpu" , accumulate_grad_batches=4 , callbacks=trainer_callbacks - , min_epochs=50 - , max_epochs=params.epochs + , min_epochs=params.min_epochs + , max_epochs=params.max_epochs , log_every_n_steps=1 ) @@ -453,9 +454,15 @@ def prob (x): parser.add_argument( '-n', '--num-workers', metavar='NUM_WORKERS', type=auto_pos_int , help=f"The NUM_WORKERS for the run, a positive integer (default {dflt_params.num_workers})") + parser.add_argument( + '--min-epochs', metavar='MIN_EPOCHS', type=auto_pos_int + , help=f"Set the MIN_EPOCHS for the run, a positive integer (default {dflt_params.min_epochs})") + parser.add_argument( + '--max-epochs', metavar='MAX_EPOCHS', type=auto_pos_int + , help=f"Set the MAX_EPOCHS for the run, a positive integer (default {dflt_params.max_epochs})") parser.add_argument( '-e', '--num-epochs', metavar='NUM_EPOCHS', type=auto_pos_int - , help=f"The NUM_EPOCHS for the run, a positive integer (default {dflt_params.epochs})") + , help=f"Forces the NUM_EPOCHS for the run, a positive integer (sets both min and max epoch)") parser.add_argument('--clear-checkpoints', action='store_true' , help='remove checkpoints') parser.add_argument('-v', '--verbose', action='count', default=0 @@ -508,8 +515,13 @@ def prob (x): params.num_hiddens = clargs.number_hiddens if clargs.num_workers: params.num_workers = clargs.num_workers + if clargs.min_epochs: + params.min_epochs = clargs.min_epochs + if clargs.max_epochs: + params.max_epochs = clargs.max_epochs if clargs.num_epochs: - params.epochs = clargs.num_epochs + params.min_epochs = clargs.num_epochs + params.max_epochs = clargs.num_epochs if clargs.output_dir: params.output_dir = clargs.output_dir else: From 10c8b507688710b85ef63c63cf67b85f915f0626 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 16:03:00 +0100 Subject: [PATCH 165/204] slurm script refactor args + force 150 epochs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 50925bfe..ffbc8b07 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -66,7 +66,7 @@ echo " - model {model} ({model_params})" echo " - compression_factor {compression_factor}" echo " - batch size {batch_size}" -python3 shapeembed.py --no-early-stop --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} +python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} """ ################################################################################ @@ -79,6 +79,9 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ logger.info(f'spawning {jobname}') with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: extra_args=[] + extra_args.append('--no-early-stop') + extra_args.append('--num-epochs') + extra_args.append('150') for k, v in kwargs.items(): extra_args.append(f'--model-arg-{k}') extra_args.append(f'{v}') From c0cd3b4b4b34362e3396089e6b137f5cd4c473fb Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:11:06 +0100 Subject: [PATCH 166/204] bring triangular + compression computation in named function (to share use) --- scripts/shapeembed/shapeembed.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 9144e17a..cd41fbe7 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -99,6 +99,9 @@ , cycle_momentum=False ) +def compressed_n_features(dist_mat_size, comp_fact): + return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) + def model_str(params): s = f'{params.model_name}' if vars(params.model_args): @@ -507,8 +510,7 @@ def prob (x): params.distance_matrix_roll_probability = clargs.distance_matrix_roll_probability if clargs.compression_factor: params.compression_factor = clargs.compression_factor - n_features = lambda d, n: d*(d-1)//(2**n) - params.latent_dim = n_features(params.distance_matrix_size, params.compression_factor) + params.latent_dim = compressed_n_features(params.distance_matrix_size, params.compression_factor) if clargs.number_embeddings: params.num_embeddings = clargs.number_embeddings if clargs.number_hiddens: From 3b1fdda50c23cdf9da9f5176b42a525ca0f09018 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:12:00 +0100 Subject: [PATCH 167/204] fix in model_str function test of model_args --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index cd41fbe7..ab0f11a9 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -104,7 +104,7 @@ def compressed_n_features(dist_mat_size, comp_fact): def model_str(params): s = f'{params.model_name}' - if vars(params.model_args): + if hasattr(params, 'model_args'): s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" return s From 65842f6ca0b217bd1b3cdc07ef399d4a72da1c10 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 20 Jul 2024 19:14:50 +0100 Subject: [PATCH 168/204] 
refactor slurm script to detect already completed jobs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 162 +++++++++++++------ 1 file changed, 115 insertions(+), 47 deletions(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index ffbc8b07..a507c40f 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -1,12 +1,18 @@ #! /usr/bin/env python3 import os +import glob +import copy +import types import logging +import tempfile import argparse import datetime import itertools import subprocess +import shapeembed + # shapeembed parameters to sweap ################################################################################ @@ -48,6 +54,47 @@ batch_sizes = [4, 8, 16] +def gen_params_sweap_list(): + p_sweap_list = [] + for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) + , 'model_name': m + , 'compression_factor': cf + , 'latent_dim': shapeembed.compressed_n_features(512, cf) + , 'batch_size': bs + } for ds in datasets + for m in models + for cf in compression_factors + for bs in batch_sizes ]: + # per model params: + if params['model_name'] in model_params: + mps = model_params[params['model_name']] + for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: + newparams = copy.deepcopy(params) + newparams['model_args'] = types.SimpleNamespace(**ps) + p_sweap_list.append(types.SimpleNamespace(**newparams)) + else: + p_sweap_list.append(types.SimpleNamespace(**params)) + return p_sweap_list + +def params_match(x, ys): + found = False + def check_model_args(a, b): + a_yes = hasattr(a, 'model_args') + b_yes = hasattr(b, 'model_args') + if not a_yes and not b_yes: return True + if a_yes and b_yes: return a.model_args == b.model_args + return False + for y in ys: + if x.dataset.name == y.dataset \ + and x.model_name == y.model_name \ + and check_model_args(x, y) \ + and x.compression_factor == y.compression_factor \ + and x.latent_dim == y.latent_dim \ + and x.batch_size == y.batch_size: + found = True + break + return found + # other parameters ################################################################################ @@ -58,6 +105,7 @@ slurm_mem = '250G' slurm_gpus = 'a100:1' +shapeembed_script=f'{os.getcwd()}/shapeembed.py' wandb_project='shapeembed' slurm_script="""#! 
/bin/bash @@ -71,43 +119,70 @@ ################################################################################ -def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_factor, batch_size, **kwargs): - model_str = model - if kwargs: - model_str += f"_{'_'.join([f'{k}{v}' for k, v in kwargs.items()])}" - jobname = f'shapeembed-{dataset[0]}-{model_str}-{compression_factor}-{batch_size}' - logger.info(f'spawning {jobname}') - with open(f'{slurm_out_dir}/{jobname}.script', mode='w+') as fp: - extra_args=[] - extra_args.append('--no-early-stop') - extra_args.append('--num-epochs') - extra_args.append('150') - for k, v in kwargs.items(): - extra_args.append(f'--model-arg-{k}') - extra_args.append(f'{v}') - fp.write(slurm_script.format( dataset=dataset - , model=model - , model_params=[] - , compression_factor=compression_factor - , batch_size=batch_size - , out_dir=out_dir - , wandb_project=wandb_project - , extra_args=' '.join(extra_args) )) +def model_params_from_model_params_str(modelparamsstr): + rawps = modelparamsstr.split('_') + ps = {} + for p in rawps: + if p[0:4] == 'beta': ps['beta'] = float(p[4:]) + return types.SimpleNamespace(**ps) + +def params_from_job_str(jobstr): + raw = jobstr.split('-') + ps = {} + ps['batch_size'] = int(raw.pop()) + ps['latent_dim'] = int(raw.pop()) + ps['compression_factor'] = int(raw.pop()) + if len(raw) == 3: + ps['model_args'] = model_params_from_model_params_str(raw.pop()) + ps['model_name'] = raw.pop() + ps['dataset'] = raw.pop() + return types.SimpleNamespace(**ps) + +def find_done_params(out_dir): + ps = [] + for f in glob.glob(f'{out_dir}/*-shapeembed-score_df.csv'): + ps.append(params_from_job_str(os.path.basename(f)[:-24])) + return ps + +def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): + + jobname = shapeembed.job_str(ps) + cmd = [ 'python3', shapeembed_script + , '--wandb-project', wandb_project + , '--output-dir', out_dir + ] + cmd += [ '--clear-checkpoints' + , '--no-early-stop' + , '--num-epochs', 150 + ] + cmd += [ '--dataset', ps.dataset.name, ps.dataset.path, ps.dataset.type + , '--model', ps.model_name + , '--compression-factor', ps.compression_factor + , '--batch-size', ps.batch_size + ] + if hasattr(ps, 'model_args'): + for k, v in vars(ps.model_args).items(): + cmd.append(f'--model-arg-{k}') + cmd.append(f'{v}') + logger.debug(" ".join(map(str,cmd))) + with tempfile.NamedTemporaryFile('w+') as fp: + fp.write('#! 
/usr/bin/env sh\n') + fp.write(" ".join(map(str,cmd))) + fp.write('\n') fp.flush() - logger.info(f'written {fp.name}') - logger.debug(f'cat {fp.name}') - result = subprocess.run(['cat', fp.name], stdout=subprocess.PIPE) + cmd = [ 'sbatch' + , '--time', slurm_time + , '--mem', slurm_mem + , '--job-name', jobname + , '--output', f'{slurm_out_dir}/{jobname}.out' + , '--error', f'{slurm_out_dir}/{jobname}.err' + , f'--gpus={slurm_gpus}' + , fp.name ] + logger.debug(" ".join(map(str,cmd))) + result = subprocess.run(cmd, stdout=subprocess.PIPE) logger.debug(result.stdout.decode('utf-8')) - result = subprocess.run([ 'sbatch' - , '--time', slurm_time - , '--mem', slurm_mem - , '--job-name', jobname - , '--output', f'{slurm_out_dir}/{jobname}.out' - , '--error', f'{slurm_out_dir}/{jobname}.err' - #, '--gres', n_gpus(ls) - , f'--gpus={slurm_gpus}' - , fp.name ], stdout=subprocess.PIPE) - logger.info(result.stdout.decode('utf-8')) + logger.info(f'job spawned for {ps}') + if __name__ == "__main__": @@ -137,15 +212,8 @@ def spawn_slurm_job(logger, slurm_out_dir, out_dir, dataset, model, compression_ os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - for params in [ (ds, m, cf, bs) for ds in datasets - for m in models - for cf in compression_factors - for bs in batch_sizes ]: - # per model params: - m = params[1] - if m in model_params: - mps = model_params[m] - for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params, **ps) - else: - spawn_slurm_job(logger, clargs.slurm_output_dir, clargs.output_dir, *params) + done_params = find_done_params(clargs.output_dir) + all_params = gen_params_sweap_list() + todo_params = [x for x in all_params if not params_match(x, done_params)] + for ps in todo_params: + spawn_slurm_job(clargs.slurm_output_dir, clargs.output_dir, ps, logger=logger) From f97657e37201dbaf9962ed2488cd0cdac1e36567 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 00:02:15 +0100 Subject: [PATCH 169/204] factored out some common helpers --- scripts/shapeembed/common_helpers.py | 38 +++++++++++++++++++ scripts/shapeembed/shapeembed.py | 14 +------ scripts/shapeembed/slurm_sweap_shapeembed.py | 40 ++++---------------- 3 files changed, 48 insertions(+), 44 deletions(-) create mode 100644 scripts/shapeembed/common_helpers.py diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py new file mode 100644 index 00000000..71a13847 --- /dev/null +++ b/scripts/shapeembed/common_helpers.py @@ -0,0 +1,38 @@ +import os +import glob +import types +import logging + +def compressed_n_features(dist_mat_size, comp_fact): + return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) + +def model_str(params): + s = f'{params.model_name}' + if hasattr(params, 'model_args'): + s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" + return s + +def job_str(params): + return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" + +def params_from_job_str(jobstr): + raw = jobstr.split('-') + ps = types.SimpleNamespace() + ps.batch_size = int(raw.pop()) + ps.latent_dim = int(raw.pop()) + ps.compression_factor = int(raw.pop()) + if len(raw) == 3: + ps.model_args = types.SimpleNamespace() + for p in raw.pop().split('-'): + if p[0:4] == 'beta': ps.model_args.beta = float(p[4:]) + ps.model_name = raw.pop() + ps.dataset = raw.pop() + return ps + +def 
find_existing_run_scores(dirname, logger=logging.getLogger(__name__)): + ps = [] + for f in glob.glob(f'{dirname}/*-shapeembed-score_df.csv'): + p = params_from_job_str(os.path.basename(f)[:-24]) + p.csv_file = f + ps.append(p) + return ps diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index ab0f11a9..744f29fa 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -27,6 +27,8 @@ from dataset_transformations import * from evaluation import * +from common_helpers import * + # logging facilities ############################################################################### logger = logging.getLogger(__name__) @@ -99,18 +101,6 @@ , cycle_momentum=False ) -def compressed_n_features(dist_mat_size, comp_fact): - return dist_mat_size*(dist_mat_size-1)//(2**comp_fact) - -def model_str(params): - s = f'{params.model_name}' - if hasattr(params, 'model_args'): - s += f"-{'_'.join([f'{k}{v}' for k, v in vars(params.model_args).items()])}" - return s - -def job_str(params): - return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" - def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index a507c40f..6ebb81ac 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -11,7 +11,7 @@ import itertools import subprocess -import shapeembed +from common_helpers import * # shapeembed parameters to sweap ################################################################################ @@ -59,7 +59,7 @@ def gen_params_sweap_list(): for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) , 'model_name': m , 'compression_factor': cf - , 'latent_dim': shapeembed.compressed_n_features(512, cf) + , 'latent_dim': compressed_n_features(512, cf) , 'batch_size': bs } for ds in datasets for m in models @@ -119,34 +119,9 @@ def check_model_args(a, b): ################################################################################ -def model_params_from_model_params_str(modelparamsstr): - rawps = modelparamsstr.split('_') - ps = {} - for p in rawps: - if p[0:4] == 'beta': ps['beta'] = float(p[4:]) - return types.SimpleNamespace(**ps) - -def params_from_job_str(jobstr): - raw = jobstr.split('-') - ps = {} - ps['batch_size'] = int(raw.pop()) - ps['latent_dim'] = int(raw.pop()) - ps['compression_factor'] = int(raw.pop()) - if len(raw) == 3: - ps['model_args'] = model_params_from_model_params_str(raw.pop()) - ps['model_name'] = raw.pop() - ps['dataset'] = raw.pop() - return types.SimpleNamespace(**ps) - -def find_done_params(out_dir): - ps = [] - for f in glob.glob(f'{out_dir}/*-shapeembed-score_df.csv'): - ps.append(params_from_job_str(os.path.basename(f)[:-24])) - return ps - def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): - jobname = shapeembed.job_str(ps) + jobname = job_str(ps) cmd = [ 'python3', shapeembed_script , '--wandb-project', wandb_project , '--output-dir', out_dir @@ -203,16 +178,17 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ clargs=parser.parse_args() # set verbosity level + logging.basicConfig() logger = logging.getLogger(__name__) - if clargs.verbose > 2: - logger.setLevel(logging.DEBUG) + if clargs.verbose > 1: + logger.setLevel('DEBUG') elif clargs.verbose > 0: - logger.setLevel(logging.INFO) 
+ logger.setLevel('INFO') os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - done_params = find_done_params(clargs.output_dir) + done_params = find_existing_run_scores(clargs.output_dir) all_params = gen_params_sweap_list() todo_params = [x for x in all_params if not params_match(x, done_params)] for ps in todo_params: From b74be39fc79fe068876527d44c06fe4576bd3fe0 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 16:58:20 +0100 Subject: [PATCH 170/204] Add a comment/uncomment block for quick ad-hoc single config run --- scripts/shapeembed/slurm_sweap_shapeembed.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 6ebb81ac..0cbe042e 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -54,6 +54,16 @@ batch_sizes = [4, 8, 16] +# XXX XXX XXX XXX XXX XXX XXX # +# XXX ad-hoc one-off config XXX # +# XXX XXX XXX XXX XXX XXX XXX # +# uncomment the lines below for a quick overwrite of the parameter sweep +#datasets = [("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask")] +#models = ["resnet50_vae"] +#model_params = {} #{"resnet50_beta_vae": {'beta': [1]}} +#compression_factors = [10] +#batch_sizes = [16] + def gen_params_sweap_list(): p_sweap_list = [] for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) From 9b8e93463123cf895957bda084ae5e22a8c61963 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 16:59:40 +0100 Subject: [PATCH 171/204] added a function to find currently submitted slurm jobs --- scripts/shapeembed/slurm_sweap_shapeembed.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 0cbe042e..482afc5d 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -105,6 +105,10 @@ def check_model_args(a, b): break return found +def find_submitted_slurm_jobs(): + jobs = subprocess.run(['squeue', '--format', '%j'], stdout=subprocess.PIPE).stdout.decode('utf-8').split() + return list(map(params_from_job_str, jobs[1:])) + # other parameters ################################################################################ From 2cead3d94bae977b9ab6036dd38f94a35689df2a Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 17:00:22 +0100 Subject: [PATCH 172/204] added clargs for job filtering enabling/disabling (enabled by default) --- scripts/shapeembed/slurm_sweap_shapeembed.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweap_shapeembed.py index 482afc5d..0024f557 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweap_shapeembed.py @@ -185,6 +185,14 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ '-o', '--output-dir', metavar='OUTPUT_DIR', default=dflt_out_dir , help=f"The OUTPUT_DIR path to use to dump results") + parser.add_argument( + '--filter-done', action=argparse.BooleanOptionalAction, default=True + , help=f'filter out jobs with results (a *scores_df.csv) in OUTPUT_DIR') + + parser.add_argument( + '--filter-submitted', action=argparse.BooleanOptionalAction, default=True + , help=f'filter out jobs present in the current slurm `squeue`') + 
parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") @@ -203,7 +211,14 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ os.makedirs(clargs.output_dir, exist_ok=True) done_params = find_existing_run_scores(clargs.output_dir) + in_slurm_params = find_submitted_slurm_jobs() all_params = gen_params_sweap_list() - todo_params = [x for x in all_params if not params_match(x, done_params)] + + todo_params = all_params + if clargs.filter_done: + todo_params = [x for x in todo_params if not params_match(x, done_params)] + if clargs.filter_submitted: + todo_params = [x for x in todo_params if not params_match(x, in_slurm_params)] + for ps in todo_params: spawn_slurm_job(clargs.slurm_output_dir, clargs.output_dir, ps, logger=logger) From 650fcc17d5b2d7d5715ad2ddf2e653baad0470c8 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 17:04:39 +0100 Subject: [PATCH 173/204] typo fix: sweap -> sweep --- ...eap_shapeembed.py => slurm_sweep_shapeembed.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename scripts/shapeembed/{slurm_sweap_shapeembed.py => slurm_sweep_shapeembed.py} (96%) diff --git a/scripts/shapeembed/slurm_sweap_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py similarity index 96% rename from scripts/shapeembed/slurm_sweap_shapeembed.py rename to scripts/shapeembed/slurm_sweep_shapeembed.py index 0024f557..a9d89901 100755 --- a/scripts/shapeembed/slurm_sweap_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -13,7 +13,7 @@ from common_helpers import * -# shapeembed parameters to sweap +# shapeembed parameters to sweep ################################################################################ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' @@ -64,8 +64,8 @@ #compression_factors = [10] #batch_sizes = [16] -def gen_params_sweap_list(): - p_sweap_list = [] +def gen_params_sweep_list(): + p_sweep_list = [] for params in [ { 'dataset': types.SimpleNamespace(name=ds[0], path=ds[1], type=ds[2]) , 'model_name': m , 'compression_factor': cf @@ -81,10 +81,10 @@ def gen_params_sweap_list(): for ps in [dict(zip(mps.keys(), vs)) for vs in itertools.product(*mps.values())]: newparams = copy.deepcopy(params) newparams['model_args'] = types.SimpleNamespace(**ps) - p_sweap_list.append(types.SimpleNamespace(**newparams)) + p_sweep_list.append(types.SimpleNamespace(**newparams)) else: - p_sweap_list.append(types.SimpleNamespace(**params)) - return p_sweap_list + p_sweep_list.append(types.SimpleNamespace(**params)) + return p_sweep_list def params_match(x, ys): found = False @@ -212,7 +212,7 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ done_params = find_existing_run_scores(clargs.output_dir) in_slurm_params = find_submitted_slurm_jobs() - all_params = gen_params_sweap_list() + all_params = gen_params_sweep_list() todo_params = all_params if clargs.filter_done: From 62704af70eba4880bd586918150edbb88a88127e Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:08:51 +0100 Subject: [PATCH 174/204] parse dataset as a SimpleNamespace from job string --- scripts/shapeembed/common_helpers.py | 2 +- scripts/shapeembed/slurm_sweep_shapeembed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py index 71a13847..204b4c09 100644 --- a/scripts/shapeembed/common_helpers.py +++ 
b/scripts/shapeembed/common_helpers.py @@ -26,7 +26,7 @@ def params_from_job_str(jobstr): for p in raw.pop().split('-'): if p[0:4] == 'beta': ps.model_args.beta = float(p[4:]) ps.model_name = raw.pop() - ps.dataset = raw.pop() + ps.dataset = types.SimpleNamespace(name=raw.pop()) return ps def find_existing_run_scores(dirname, logger=logging.getLogger(__name__)): diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index a9d89901..a15e85cd 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -95,7 +95,7 @@ def check_model_args(a, b): if a_yes and b_yes: return a.model_args == b.model_args return False for y in ys: - if x.dataset.name == y.dataset \ + if x.dataset.name == y.dataset.name \ and x.model_name == y.model_name \ and check_model_args(x, y) \ and x.compression_factor == y.compression_factor \ From 6e9ffcf2ca751ca23eadacfe0a04be98964180a5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:10:32 +0100 Subject: [PATCH 175/204] updated data gathering script to newer changes (still TODO for figures) --- scripts/shapeembed/gather_run_results.py | 220 ++++++++++++----------- 1 file changed, 116 insertions(+), 104 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1af719be..11410d39 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -1,12 +1,15 @@ #! /usr/bin/env python3 -import pandas as pd +import os +import shutil import logging import argparse -import shutil -import os +import datetime import functools - +import pandas as pd + +from common_helpers import * + # define a Custom aggregation # function for finding total def keep_first_fname(series): @@ -17,128 +20,136 @@ def get_run_info(run): return f'{x[0]}_{x[1]}', x[2], x[4] def main_process(clargs, logger=logging.getLogger(__name__)): - print(clargs) + + params = [] + for f in clargs.run_folders: + ps = find_existing_run_scores(f) + for p in ps: p.folder = f + params.append(ps) + params = [x for ps in params for x in ps] + logger.debug(params) + os.makedirs(clargs.output_dir, exist_ok=True) - dfs = [] - for d in clargs.run_folder: - csv = f'{d}/scores_df.csv' - #csv = f'{d}/scores_df_mean.csv' - if not os.path.isfile(csv): - print(f'WARNING: no {csv} found, skipping') - continue - - run_name = os.path.basename(d) - model, latent_space_sz, dataset = get_run_info(run_name) - df = pd.read_csv(csv) - df['model'] = model - df['latent_space_sz'] = latent_space_sz - df['dataset'] = dataset - - for trial in ['efd','regionprops','shapeembed', 'combined_all']: - - conf_mat = f'{trial}_confusion_matrix.png' - if os.path.isfile(f'{d}/{conf_mat}'): - shutil.copy(f'{d}/{conf_mat}',f'{clargs.output_dir}/{run_name}_{conf_mat}') - df.loc[df['trial'] == trial, 'conf_mat'] = f'./{run_name}_{conf_mat}' - else: - df.loc[df['trial'] == trial, 'conf_mat'] = f'nofile' - - umap = f'umap_{trial}.pdf' - if os.path.isfile(f'{d}/{umap}'): - shutil.copy(f'{d}/{umap}',f'{clargs.output_dir}/{run_name}_{umap}') - df.loc[df['trial'] == trial, 'umap'] = f'./{run_name}_{umap}' - else: - df.loc[df['trial'] == trial, 'umap'] = f'nofile' - - barplot = f'scores_barplot.pdf' - if os.path.isfile(f'{d}/{barplot}'): - shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') - df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' - else: - df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + dfs = [] + for p 
in params: + + # open scores dataframe + df = pd.read_csv(p.csv_file, index_col=0) + + # pair up with confusion matrix + conf_mat_file = f'{job_str(p)}-shapeembed-confusion_matrix.png' + print(f'{p.folder}/{conf_mat_file}') + if os.path.isfile(f'{p.folder}/{conf_mat_file}'): + shutil.copy(f'{p.folder}/{conf_mat_file}',f'{clargs.output_dir}/{conf_mat_file}') + df['conf_mat'] = f'./{conf_mat_file}' + else: + df['conf_mat'] = f'nofile' + + # pair up with umap + umap_file = f'{job_str(p)}-shapeembed-umap.pdf' + if os.path.isfile(f'{p.folder}/{umap_file}'): + shutil.copy(f'{p.folder}/{umap_file}',f'{clargs.output_dir}/{umap_file}') + df['umap'] = f'./{umap_file}' + else: + df['umap'] = f'nofile' + + ## pair up with barplot + #barplot = f'scores_barplot.pdf' + #if os.path.isfile(f'{d}/{barplot}'): + # shutil.copy(f'{d}/{barplot}',f'{clargs.output_dir}/{run_name}_{barplot}') + # df.loc[df['trial'] == trial, 'barplot'] = f'./{run_name}_{barplot}' + #else: + # df.loc[df['trial'] == trial, 'barplot'] = f'nofile' + + # add dataframe to list for future concatenation dfs.append(df.convert_dtypes()) + # gather all dataframes together df = pd.concat(dfs) - df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df - df.set_index(['dataset', 'trial', 'model', 'latent_space_sz'], inplace=True) + logger.debug(df) + df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) + + #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df + df.set_index(['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) df.sort_index(inplace=True) - df = df.groupby(level=['dataset', 'trial', 'model', 'latent_space_sz']).agg({ + df = df.groupby(level=['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname - , 'barplot': keep_first_fname + #, 'barplot': keep_first_fname }) print('-'*80) print(df) print('-'*80) - - - cell_hover = { # for row hover use instead of - 'selector': 'td:hover', - 'props': [('background-color', '#ffffb3')] - } - index_names = { - 'selector': '.index_name', - 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' - } - headers = { - 'selector': 'th:not(.index_name)', - 'props': 'background-color: #eeeeee; color: #333333;' - } - - def html_img(path): - if os.path.splitext(path)[1][1:] == 'png': - return f'' - if os.path.splitext(path)[1][1:] == 'pdf': - return f'' - return '
:(
' - df['conf_mat'] = df['conf_mat'].apply(html_img) - df['umap'] = df['umap'].apply(html_img) - df['barplot'] = df['barplot'].apply(html_img) - - def render_html(fname, d): - with open(fname, 'w') as f: - f.write(''' - - - - ''') - s = d.style - s.set_table_styles([cell_hover, index_names, headers]) - s.to_html(f, classes='df') - f.write('') - - with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: - f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') - df.to_latex(f) - f.write('\\end{decument}') - render_html(f'{clargs.output_dir}/gathered_table.html', df) - - dft = df.transpose() - with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: - f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') - dft.to_latex(f) - f.write('\\end{decument}') - render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) + df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') + + + #cell_hover = { # for row hover use instead of + # 'selector': 'td:hover', + # 'props': [('background-color', '#ffffb3')] + # } + #index_names = { + # 'selector': '.index_name', + # 'props': 'font-style: italic; color: darkgrey; font-weight:normal;' + # } + #headers = { + # 'selector': 'th:not(.index_name)', + # 'props': 'background-color: #eeeeee; color: #333333;' + # } + + #def html_img(path): + # if os.path.splitext(path)[1][1:] == 'png': + # return f'' + # if os.path.splitext(path)[1][1:] == 'pdf': + # return f'' + # return '
:(
' + #df['conf_mat'] = df['conf_mat'].apply(html_img) + #df['umap'] = df['umap'].apply(html_img) + #df['barplot'] = df['barplot'].apply(html_img) + + #def render_html(fname, d): + # with open(fname, 'w') as f: + # f.write(''' + # + # + # + # ''') + # s = d.style + # s.set_table_styles([cell_hover, index_names, headers]) + # s.to_html(f, classes='df') + # f.write('') + + #with open(f'{clargs.output_dir}/gathered_table.tex', 'w') as f: + # f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + # df.to_latex(f) + # f.write('\\end{decument}') + #render_html(f'{clargs.output_dir}/gathered_table.html', df) + + #dft = df.transpose() + #with open(f'{clargs.output_dir}/gathered_table_transpose.tex', 'w') as f: + # f.write('\\documentclass[12pt]{article}\n\\usepackage{booktabs}\n\\usepackage{underscore}\n\\usepackage{multirow}\n\\begin{document}\n') + # dft.to_latex(f) + # f.write('\\end{decument}') + #render_html(f'{clargs.output_dir}/gathered_table_transpose.html', dft) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Run the shape embed pipeline') - parser.add_argument( 'run_folder', nargs="+", type=str + parser.add_argument( 'run_folders', metavar='run_folder', nargs="+", type=str , help=f"The runs folders to gather results from") parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR' - , default=f'{os.getcwd()}/gathered_results' + , default=f'{os.getcwd()}/gathered_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' , help=f"The OUTPUT_DIR path to use to gather results") parser.add_argument('-v', '--verbose', action='count', default=0 , help="Increase verbosity level by adding more \"v\".") @@ -147,10 +158,11 @@ def render_html(fname, d): clargs=parser.parse_args() # set verbosity level + logging.basicConfig() logger = logging.getLogger(__name__) - if clargs.verbose > 2: - logger.setLevel(logging.DEBUG) + if clargs.verbose > 1: + logger.setLevel('DEBUG') elif clargs.verbose > 0: - logger.setLevel(logging.INFO) + logger.setLevel('INFO') main_process(clargs, logger) From ffce0d38677c84fe5e47fca85e4e61a4329647e9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 18:26:48 +0100 Subject: [PATCH 176/204] removed stale script string --- scripts/shapeembed/slurm_sweep_shapeembed.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index a15e85cd..cc022bed 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -122,15 +122,6 @@ def find_submitted_slurm_jobs(): shapeembed_script=f'{os.getcwd()}/shapeembed.py' wandb_project='shapeembed' -slurm_script="""#! 
/bin/bash -echo "running shape embed with:" -echo " - dataset {dataset[0]} ({dataset[1]}, {dataset[2]})" -echo " - model {model} ({model_params})" -echo " - compression_factor {compression_factor}" -echo " - batch size {batch_size}" -python3 shapeembed.py --wandb-project {wandb_project} --dataset {dataset[0]} {dataset[1]} {dataset[2]} --model {model} --compression-factor {compression_factor} --batch-size {batch_size} --clear-checkpoints --output-dir {out_dir} {extra_args} -""" - ################################################################################ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name__)): From d802f9a751855da13568e1866afd87f677125bb6 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 21 Jul 2024 23:36:21 +0100 Subject: [PATCH 177/204] Split model name in two columns if there are model args --- scripts/shapeembed/gather_run_results.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 11410d39..9a78f003 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -37,6 +37,13 @@ def main_process(clargs, logger=logging.getLogger(__name__)): # open scores dataframe df = pd.read_csv(p.csv_file, index_col=0) + # split model column in case model args are present + model_cols = df['model'].str.split('-', n=1, expand=True) + if model_cols.shape[1] == 2: + df = df.drop('model', axis=1) + df.insert(1, 'model_args', model_cols[1]) + df.insert(1, 'model', model_cols[0]) + # pair up with confusion matrix conf_mat_file = f'{job_str(p)}-shapeembed-confusion_matrix.png' print(f'{p.folder}/{conf_mat_file}') From 663cc52f1e02fbe575409dc94400b4aa0beab776 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 00:25:03 +0100 Subject: [PATCH 178/204] remove stale import --- scripts/shapeembed/evaluation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 3f3452d8..26f344b7 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -18,8 +18,6 @@ import seaborn import matplotlib.pyplot as plt -from bioimage_embed.shapes.transforms import ImageToCoords - # logging facilities ############################################################################### logger = logging.getLogger(__name__) From 7d328d9102bc765492362e57b6c8c7183e53d313 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 00:25:38 +0100 Subject: [PATCH 179/204] experiment with plots --- scripts/shapeembed/evaluation.py | 48 ++++++++++++++++++++++++ scripts/shapeembed/gather_run_results.py | 20 +++++----- 2 files changed, 57 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 26f344b7..bd71cb17 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -184,3 +184,51 @@ def save_scores( scores_df .xs("test_f1", level="Metric", drop_level=False) .groupby("trial") .mean()) + +def save_barplot( scores_df + , outputdir='.' 
+ , width = 7 + , height = 7 / 1.2 ): + # save a barplot representation of scores + melted_df = scores_df[['model', 'beta', 'compression_factor', 'batch_size', 'test_f1']].melt( + id_vars=['model', 'beta', 'compression_factor', 'batch_size'] + , var_name="Metric" + , value_name="Score" + ) + for m in melted_df['model'].unique(): + for cf in melted_df['compression_factor'].unique(): + if 'beta' in m: + for bs in melted_df['batch_size'].unique(): + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['compression_factor'] == cf) & (melted_df['batch_size'] == bs) + , ['beta', 'Metric', 'Score'] ] + , kind="bar" + , x='beta' + , hue="Metric" + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against beta ({m}, compression factor {cf}, batch size {bs})') + plt.savefig(f"{outputdir}/beta_barplot_{m}_{cf}_{bs}.pdf") + plt.close() + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['compression_factor'] == cf) + , ['batch_size', 'beta', 'Metric', 'Score'] ] + , kind="bar" + , x='batch_size' + , hue='beta' if 'beta' in m else 'Metric' + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + plt.savefig(f"{outputdir}/barplot_{m}_{cf}.pdf") + plt.close() + # log info + #logger.info(melted_df.set_index(["trial", "Metric"]) + # .xs("test_f1", level="Metric", drop_level=False) + # .groupby("trial") + # .mean()) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 9a78f003..1c9b33e0 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -9,15 +9,7 @@ import pandas as pd from common_helpers import * - -# define a Custom aggregation -# function for finding total -def keep_first_fname(series): - return functools.reduce(lambda x, y: y if x == 'nofile' else y, series) - -def get_run_info(run): - x = run.split('_') - return f'{x[0]}_{x[1]}', x[2], x[4] +from evaluation import * def main_process(clargs, logger=logging.getLogger(__name__)): @@ -76,11 +68,17 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df = pd.concat(dfs) logger.debug(df) df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) + save_barplot(df, clargs.output_dir) #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df - df.set_index(['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) + # define a Custom aggregation + # function for finding total + def keep_first_fname(series): + return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) + df.set_index(['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) df.sort_index(inplace=True) - df = df.groupby(level=['dataset', 'trial', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' From e3796ba8684b4115180360d99bf4320a750d79da Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 
08:42:41 +0100 Subject: [PATCH 180/204] keep exploring potential plots --- scripts/shapeembed/evaluation.py | 42 ++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index bd71cb17..96c399c8 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -190,12 +190,50 @@ def save_barplot( scores_df , width = 7 , height = 7 / 1.2 ): # save a barplot representation of scores - melted_df = scores_df[['model', 'beta', 'compression_factor', 'batch_size', 'test_f1']].melt( - id_vars=['model', 'beta', 'compression_factor', 'batch_size'] + melted_df = scores_df[['model', 'beta', 'compression_factor', 'latent_dim', 'batch_size', 'test_f1']].melt( + id_vars=['model', 'beta', 'compression_factor', 'latent_dim', 'batch_size'] , var_name="Metric" , value_name="Score" ) + # test plots... for m in melted_df['model'].unique(): + # 1 - general overview plot... + df = melted_df.loc[ (melted_df['model'] == m) + , ['compression_factor', 'latent_dim', 'batch_size', 'beta', 'Metric', 'Score'] ].sort_values(by=['compression_factor', 'latent_dim', 'batch_size', 'beta']) + hue = df[['compression_factor', 'latent_dim']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim})', axis=1) + if 'beta' in m: + hue = df[['compression_factor', 'latent_dim', 'beta']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim}), beta: {r.beta}', axis=1) + ax = seaborn.catplot( data=df + , kind="bar" + , x='batch_size' + , y="Score" + , hue=hue + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + #ax.tick_params(axis='x', rotation=90) + #ax.set(xlabel=None) + #ax.set(xticklabels=[]) + ax._legend.remove() + #ax.fig.legend(loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=3) + #ax.fig.legend(ncol=4, loc='lower center') + ax.fig.legend(ncol=1) + #ax.fig.subplots_adjust(top=0.9) + #ax.set(title=f'f1 score against batch size ({m})') + + #add overall title + plt.title(f'f1 score against batch size ({m})', fontsize=16) + + ##add axis titles + #plt.xlabel('') + #plt.ylabel('') + + #rotate x-axis labels + #plt.xticks(rotation=45) + + plt.savefig(f"{outputdir}/barplot_{m}.pdf", bbox_inches="tight") + plt.close() + # 2 - more specific plots for cf in melted_df['compression_factor'].unique(): if 'beta' in m: for bs in melted_df['batch_size'].unique(): From cecde4e12997981a056384eb967a5a099390c684 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 10:08:35 +0100 Subject: [PATCH 181/204] more graphs --- scripts/shapeembed/evaluation.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 96c399c8..1a872eeb 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -231,8 +231,46 @@ def save_barplot( scores_df #rotate x-axis labels #plt.xticks(rotation=45) - plt.savefig(f"{outputdir}/barplot_{m}.pdf", bbox_inches="tight") + plt.savefig(f"{outputdir}/barplot_{m}_x_bs.pdf", bbox_inches="tight") plt.close() + + # 1b - general overview plot... 
+ df = melted_df.loc[ (melted_df['model'] == m) + , ['batch_size', 'compression_factor', 'latent_dim', 'beta', 'Metric', 'Score'] ].sort_values(by=['batch_size', 'compression_factor', 'latent_dim', 'beta']) + hue = df['batch_size'].apply(lambda r: f'bs: {r}') + if 'beta' in m: + hue = df[['batch_size', 'beta']].apply(lambda r: f'bs: {r.batch_size}, beta: {r.beta}', axis=1) + ax = seaborn.catplot( data=df + , kind="bar" + , x=df[['compression_factor', 'latent_dim']].apply(lambda r: f'cf: {r.compression_factor}({r.latent_dim})', axis=1) + , y="Score" + , hue=hue + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + #ax.tick_params(axis='x', rotation=90) + #ax.set(xlabel=None) + #ax.set(xticklabels=[]) + ax._legend.remove() + #ax.fig.legend(loc='upper center', bbox_to_anchor=(0.5, 0.0), ncol=3) + #ax.fig.legend(ncol=4, loc='lower center') + ax.fig.legend(ncol=1) + #ax.fig.subplots_adjust(top=0.9) + #ax.set(title=f'f1 score against batch size ({m})') + + #add overall title + plt.title(f'f1 score against compression factor (latent space size) ({m})', fontsize=16) + + ##add axis titles + #plt.xlabel('') + #plt.ylabel('') + + #rotate x-axis labels + #plt.xticks(rotation=45) + + plt.savefig(f"{outputdir}/barplot_{m}_x_cf.pdf", bbox_inches="tight") + plt.close() + # 2 - more specific plots for cf in melted_df['compression_factor'].unique(): if 'beta' in m: @@ -263,7 +301,21 @@ def save_barplot( scores_df ax.tick_params(axis='x', rotation=90) ax.fig.subplots_adjust(top=0.9) ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') - plt.savefig(f"{outputdir}/barplot_{m}_{cf}.pdf") + plt.savefig(f"{outputdir}/barplot_{m}_x_bs_cf{cf}.pdf") + plt.close() + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == cf) + , ['compression_factor', 'beta', 'Metric', 'Score'] ] + , kind="bar" + , x='compression_factor' + , hue='beta' if 'beta' in m else 'Metric' + , y="Score" + , errorbar="se" + , height=height + , aspect=width * 2**0.5 / height ) + ax.tick_params(axis='x', rotation=90) + ax.fig.subplots_adjust(top=0.9) + ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + plt.savefig(f"{outputdir}/barplot_{m}_x_cf_bs{bs}.pdf") plt.close() # log info #logger.info(melted_df.set_index(["trial", "Metric"]) From 344cff131aee49594f3cf2f61ad729f7a00a6756 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 20:12:45 +0100 Subject: [PATCH 182/204] fix model name in shapeembed output csv --- scripts/shapeembed/shapeembed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index 744f29fa..b027a3c9 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -104,7 +104,7 @@ def tag_cols(params): cols = [] cols.append(('dataset', params.dataset.name)) - cols.append(('model', model_str(params))) + cols.append(('model', params.model_name)) for k, v in vars(params.model_args).items(): cols.append((k, v)) cols.append(('compression_factor', params.compression_factor)) cols.append(('latent_dim', params.latent_dim)) From 6846ce15f278e5238d57b85d08f6d7ee3b912a69 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 22 Jul 2024 21:28:09 +0100 Subject: [PATCH 183/204] Added loss / mse to shapeembed's generated csv --- bioimage_embed/lightning/torch.py | 26 ++++++++++++++------------ scripts/shapeembed/shapeembed.py | 5 ++++- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git 
a/bioimage_embed/lightning/torch.py b/bioimage_embed/lightning/torch.py index ab730c3f..22147b81 100644 --- a/bioimage_embed/lightning/torch.py +++ b/bioimage_embed/lightning/torch.py @@ -49,6 +49,8 @@ def __init__(self, model, args=SimpleNamespace()): # self.args = SimpleNamespace(**{**merged_kwargs, **vars(self.args)}) self.save_hyperparameters(vars(self.args)) # self.model.train() + # keep a handle on metrics logged by the model + self.metrics = {} def forward(self, batch): x = self.batch_to_tensor(batch) @@ -118,12 +120,12 @@ def validation_step(self, batch, batch_idx): x = self.batch_to_tensor(batch) model_output, loss = self.get_model_output(x, batch_idx) z = self.embedding_from_output(model_output) - self.log_dict( - { - "loss/val": loss, - "mse/val": F.mse_loss(model_output.recon_x, x["data"]), - } - ) + val_metrics ={ + "loss/val": loss, + "mse/val": F.mse_loss(model_output.recon_x, x["data"]), + } + self.log_dict( val_metrics,) + self.metrics = {**self.metrics, **val_metrics} return loss # def lr_scheduler_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None): @@ -171,12 +173,12 @@ def test_step(self, batch, batch_idx): loss = self.loss_function(model_output) # Log test metrics - self.log_dict( - { - "loss/test": loss, - "mse/test": F.mse_loss(model_output.recon_x, x["data"]), - } - ) + test_metrics = { + "loss/test": loss, + "mse/test": F.mse_loss(model_output.recon_x, x["data"]), + } + self.log_dict(test_metrics) + self.metrics = {**self.metrics, **test_metrics} return loss diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index b027a3c9..e62dbd59 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -344,6 +344,9 @@ def main_process(params): trainer, model, dataloader , num_workers=params.num_workers ) + + # gather and log stats + ###################### logger.debug(f'\n{shapeembed_df}') pfx=job_str(params) np.save(f'{params.output_dir}/{pfx}-shapeembed-latent_space.npy', latent_space) @@ -352,7 +355,7 @@ def main_process(params): logger.info(f'-- generate shapeembed umap --') umap_plot(shapeembed_df, f'{pfx}-shapeembed', outputdir=params.output_dir) logger.info(f'-- score shape embed --') - shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)) + shapeembed_cm, shapeembed_score_df = score_dataframe(shapeembed_df, pfx, tag_cols(params)+[(k, v.item()) for k, v in model.metrics.items()]) logger.info(f'-- shapeembed on {params.dataset.name}, score\n{shapeembed_score_df}') shapeembed_score_df.to_csv(f"{params.output_dir}/{pfx}-shapeembed-score_df.csv") logger.info(f'-- confusion matrix:\n{shapeembed_cm}') From 7a8972fcc90b5d67005f67c009c4ef41c766ede9 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Jul 2024 19:22:13 +0100 Subject: [PATCH 184/204] updated slurm script with regex filtering of squeue output --- scripts/shapeembed/common_helpers.py | 4 ++++ scripts/shapeembed/slurm_sweep_shapeembed.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/scripts/shapeembed/common_helpers.py b/scripts/shapeembed/common_helpers.py index 204b4c09..fd09a241 100644 --- a/scripts/shapeembed/common_helpers.py +++ b/scripts/shapeembed/common_helpers.py @@ -1,3 +1,4 @@ +import re import os import glob import types @@ -15,6 +16,9 @@ def model_str(params): def job_str(params): return f"{params.dataset.name}-{model_str(params)}-{params.compression_factor}-{params.latent_dim}-{params.batch_size}" +def job_str_re(): + return 
re.compile("(.*)-(.*)-(\d+)-(\d+)-(\d+)") + def params_from_job_str(jobstr): raw = jobstr.split('-') ps = types.SimpleNamespace() diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index cc022bed..7d4aa40c 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -18,9 +18,10 @@ datasets_pfx = '/nfs/research/uhlmann/afoix/datasets/image_datasets' datasets = [ - ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") +# ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") -# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") + ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") @@ -28,14 +29,14 @@ ] models = [ - "resnet18_vae" + "resnet18_vqvae" +, "resnet50_vqvae" +, "resnet18_vae" , "resnet50_vae" , "resnet18_beta_vae" , "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" -, "resnet18_vqvae" -, "resnet50_vqvae" #, "resnet18_vqvae_legacy" #, "resnet50_vqvae_legacy" #, "resnet101_vqvae_legacy" @@ -46,8 +47,8 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [1,2,5,10,20]} -, "resnet50_beta_vae": {'beta': [1,2,5,10,20]} + "resnet18_beta_vae": {'beta': [2,5]} +, "resnet50_beta_vae": {'beta': [2,5]} } compression_factors = [1,2,3,5,10] @@ -107,7 +108,7 @@ def check_model_args(a, b): def find_submitted_slurm_jobs(): jobs = subprocess.run(['squeue', '--format', '%j'], stdout=subprocess.PIPE).stdout.decode('utf-8').split() - return list(map(params_from_job_str, jobs[1:])) + return list(map(params_from_job_str, filter(lambda x: x, map(job_str_re().match, jobs[1:])))) # other parameters ################################################################################ From d9c87a3f7eca8ea59e29dd5edb456b55739b3eee Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 25 Jul 2024 23:43:04 +0100 Subject: [PATCH 185/204] added a simple latex table to the gather_run_results script --- scripts/shapeembed/gather_run_results.py | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1c9b33e0..db968aa0 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -1,6 +1,7 @@ #! 
/usr/bin/env python3 import os +import re import shutil import logging import argparse @@ -11,6 +12,21 @@ from common_helpers import * from evaluation import * +def simple_table(df, tname, model_re=".*vq.*"): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1'] + df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) + df = df.sort_values(by='test_f1', ascending=False).iloc[:10] + + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|llll|r|} \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score \\\\ \hline\n") + for _, r in df.iterrows(): + mname = r['model'].replace('_','\_') + beta = '-' if pd.isna(r['beta']) else r['beta'] + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:.4f} \\\\\n") + fp.write("\hline\n") + fp.write("\end{tabular}\n") + def main_process(clargs, logger=logging.getLogger(__name__)): params = [] @@ -75,11 +91,13 @@ def main_process(clargs, logger=logging.getLogger(__name__)): # function for finding total def keep_first_fname(series): return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) - df.set_index(['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'], inplace=True) + idx_cols = ['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'] + df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - 'test_accuracy': 'mean' + df = df.groupby(level=idx_cols).agg({ + 'beta': 'mean' + , 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' @@ -93,6 +111,8 @@ def keep_first_fname(series): print('-'*80) df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') + simple_table(df.reset_index(), f'{clargs.output_dir}/simple_table') + #cell_hover = { # for row hover use instead of # 'selector': 'td:hover', From cced4e9090791be6c2178952329b06e60ee2eb73 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:28:32 +0100 Subject: [PATCH 186/204] minor refactor in efd --- scripts/shapeembed/efd.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 4f910990..28e8aa0b 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -7,20 +7,27 @@ import argparse # own imports +#import bioimage_embed # necessary for the datamodule class to make sure we get the same test set +from bioimage_embed.shapes.transforms import ImageToCoords from evaluation import * -def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): +def get_dataset(dataset_params): # access the dataset - assert dataset_params.type == 'mask' - ds = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) - # ... 
and run efd on each image + assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' + dataset = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + return dataset + #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) + #dataloader.setup() + #return dataloader.test + +def run_elliptic_fourier_descriptors(dataset, contour_size, logger): + # run efd on each image dfs = [] - logger.info(f'running efd on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + logger.info(f'running efd on {dataset}') + for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): coeffs = pyefd.elliptic_fourier_descriptors(img, order=10, normalize=False) norm_coeffs = pyefd.normalize_efd(coeffs) df = pandas.DataFrame({ @@ -73,7 +80,7 @@ def run_elliptic_fourier_descriptors(dataset_params, contour_size, logger): # efd on input data and score - efd_df = run_elliptic_fourier_descriptors(dataset, contour_size, logger) + efd_df = run_elliptic_fourier_descriptors(get_dataset(dataset), contour_size, logger) logger.info(f'-- efd on {dataset.name}, raw\n{efd_df}') efd_df.to_csv(f"{clargs.output_dir}/{dataset.name}-efd-raw_df.csv") From 3e6e4c9f2e0d401f2687b90c0545e5334fd94b84 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:28:41 +0100 Subject: [PATCH 187/204] minor refactor in regionprops --- scripts/shapeembed/regionprops.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 3b65933f..db37ac25 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -7,19 +7,25 @@ from skimage import measure # own imports +#import bioimage_embed # necessary for the datamodule class to make sure we get the same test set from evaluation import * -def run_regionprops( dataset_params - , properties - , logger ): +def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - ds = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) - # ... 
and run regionprops for the given properties for each image + dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + return dataset + #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) + #dataloader.setup() + #return dataloader.test + +def run_regionprops( dataset + , properties + , logger ): + # run regionprops for the given properties for each image dfs = [] - logger.info(f'running regionprops on {dataset_params.name}') - logger.info(f'({dataset_params.path})') - for i, (img, lbl) in enumerate(tqdm.tqdm(ds)): + logger.info(f'running regionprops on {dataset}') + for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): data = numpy.where(numpy.array(img)>20, 255, 0) t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) @@ -75,7 +81,7 @@ def run_regionprops( dataset_params # regionprops on input data and score - regionprops_df = run_regionprops(dataset, properties, logger) + regionprops_df = run_regionprops(get_dataset(dataset), properties, logger) logger.info(f'-- regionprops on {dataset.name}, raw\n{regionprops_df}') regionprops_df.to_csv(f"{clargs.output_dir}/{dataset.name}-regionprops-raw_df.csv") From 2908c155e713a2e37000fe26d736c6e7d816e128 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 22:29:19 +0100 Subject: [PATCH 188/204] generated plots and more tables in gather_run_results --- scripts/shapeembed/gather_run_results.py | 70 +++++++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index db968aa0..e354e55b 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -4,6 +4,7 @@ import re import shutil import logging +import seaborn import argparse import datetime import functools @@ -12,18 +13,44 @@ from common_helpers import * from evaluation import * -def simple_table(df, tname, model_re=".*vq.*"): - cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1'] +#def simple_table(df, tname, model_re=".*vq.*"): +def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, best_n=40): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) - df = df.sort_values(by='test_f1', ascending=False).iloc[:10] + if sort_by_col: + df = df.sort_values(by=sort_by_col, ascending=ascending) + df = df.iloc[:best_n] with open(f'{tname}_tabular.tex', 'w') as fp: - fp.write("\\begin{tabular}{|llll|r|} \hline\n") - fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score \\\\ \hline\n") + fp.write("\\begin{tabular}{|llll|r|r|} \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & Mse \\\\ \hline\n") for _, r in df.iterrows(): mname = r['model'].replace('_','\_') beta = '-' if pd.isna(r['beta']) else r['beta'] - fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:.4f} \\\\\n") + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['mse/test']:f} \\\\\n") + fp.write("\hline\n") + fp.write("\end{tabular}\n") + +def compare_f1_mse_table(df, tname, best_n=40): + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] + df0 = df[cols].sort_values(by=cols) + df0 = df0.sort_values(by='test_f1', 
ascending=False) + df0 = df0.iloc[:best_n] + df1 = df[cols].sort_values(by=cols) + df1 = df1.sort_values(by='mse/test', ascending=True) + df1 = df1.iloc[:best_n] + df = pd.concat([df0.reset_index(), df1.reset_index()], axis=1, keys=['f1', 'mse']) + print(df) + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|llll|r|r|llll|r|r|} \hline\n") + fp.write("\multicolumn{6}{|l}{Best F1 score} & \multicolumn{6}{|l|}{Best Mse} \\\\\n") + fp.write("Model & CF (latent space) & batch size & BETA & F1 score & Mse & Model & CF (latent space) & batch size & BETA & F1 score & Mse \\\\ \hline\n") + for _, r in df.iterrows(): + f1_name = r[('f1', 'model')].replace('_','\_') + mse_name = r[('mse', 'model')].replace('_','\_') + f1_beta = '-' if pd.isna(r[('f1', 'beta')]) else r[('f1', 'beta')] + mse_beta = '-' if pd.isna(r[('mse', 'beta')]) else r[('mse', 'beta')] + fp.write(f"{f1_name} & {r[('f1', 'compression_factor')]} ({r[('f1', 'latent_dim')]}) & {r[('f1', 'batch_size')]} & {f1_beta} & {r[('f1', 'test_f1')]:f} & {r[('f1', 'mse/test')]:f} & {mse_name} & {r[('mse', 'compression_factor')]} ({r[('mse', 'latent_dim')]}) & {r[('mse', 'batch_size')]} & {mse_beta} & {r[('mse', 'test_f1')]:f} & {r[('mse', 'mse/test')]:f} \\\\\n") fp.write("\hline\n") fp.write("\end{tabular}\n") @@ -101,6 +128,10 @@ def keep_first_fname(series): , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' + , 'mse/test': 'mean' + , 'loss/test': 'mean' + , 'mse/val': 'mean' + , 'loss/val': 'mean' , 'conf_mat': keep_first_fname , 'umap': keep_first_fname #, 'barplot': keep_first_fname @@ -110,9 +141,30 @@ def keep_first_fname(series): print(df) print('-'*80) df.to_csv(f'{clargs.output_dir}/all_scores_agg_df.csv') - - simple_table(df.reset_index(), f'{clargs.output_dir}/simple_table') - + df = df.reset_index() + + # table results for f1 and mse comparison + simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') + simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) + compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) + + # mse / f1 plots + dff=df[df['mse/test'] instead of # 'selector': 'td:hover', From 41d460343d39d98787b5bd9e0c38b762d3ee4b18 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 27 Jul 2024 23:38:56 +0100 Subject: [PATCH 189/204] added regionprops and efd to gather results script --- scripts/shapeembed/gather_run_results.py | 34 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index e354e55b..3b5c7e32 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -13,6 +13,17 @@ from common_helpers import * from evaluation import * +def trial_table(df, tname): + best_model = df.dropna(subset=['model']).sort_values(by='test_f1', ascending=False).iloc[0] + with open(f'{tname}_tabular.tex', 'w') as fp: + fp.write("\\begin{tabular}{|l|r|} \hline\n") + fp.write("Trial & F1 score \\\\ \hline\n") + name = best_model['trial'].replace('_','\_') + fp.write(f"{name} & {best_model['test_f1']} \\\\ \hline\n") + fp.write(f"regionprops & {df[df['trial'] == 'regionprops'].iloc[0]['test_f1']} \\\\ \hline\n") + fp.write(f"efd & {df[df['trial'] == 'efd'].iloc[0]['test_f1']} \\\\ \hline\n") + fp.write("\end{tabular}\n") + #def simple_table(df, tname, model_re=".*vq.*"): def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, 
best_n=40): cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] @@ -56,6 +67,16 @@ def compare_f1_mse_table(df, tname, best_n=40): def main_process(clargs, logger=logging.getLogger(__name__)): + dfs = [] + + # regionprops / efd + for dirname in clargs.run_folders: + for f in glob.glob(f'{dirname}/*-regionprops-score_df.csv'): + dfs.append(pd.read_csv(f, index_col=0)) + for f in glob.glob(f'{dirname}/*-efd-score_df.csv'): + dfs.append(pd.read_csv(f, index_col=0)) + + # shapeembed params = [] for f in clargs.run_folders: ps = find_existing_run_scores(f) @@ -66,7 +87,6 @@ def main_process(clargs, logger=logging.getLogger(__name__)): os.makedirs(clargs.output_dir, exist_ok=True) - dfs = [] for p in params: # open scores dataframe @@ -111,18 +131,18 @@ def main_process(clargs, logger=logging.getLogger(__name__)): df = pd.concat(dfs) logger.debug(df) df.to_csv(f'{clargs.output_dir}/all_scores_df.csv', index=False) - save_barplot(df, clargs.output_dir) + save_barplot(df.dropna(subset=['model']), clargs.output_dir) #df = df.iloc[:, 1:] # drop first column 'unnamed' for non-mean df # define a Custom aggregation # function for finding total def keep_first_fname(series): - return functools.reduce(lambda x, y: y if x == 'nofile' else x, series) + return functools.reduce(lambda x, y: y if str(x) == 'nofile' else x, series) idx_cols = ['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size'] df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ - df = df.groupby(level=idx_cols).agg({ + df = df.groupby(level=idx_cols, dropna=False).agg({ 'beta': 'mean' , 'test_accuracy': 'mean' , 'test_precision': 'mean' @@ -147,6 +167,7 @@ def keep_first_fname(series): simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) + trial_table(df, f'{clargs.output_dir}/trials') # mse / f1 plots dff=df[df['mse/test'] Date: Mon, 29 Jul 2024 21:10:59 +0100 Subject: [PATCH 190/204] Updated graphs titles --- scripts/shapeembed/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/evaluation.py b/scripts/shapeembed/evaluation.py index 1a872eeb..d530e9f6 100644 --- a/scripts/shapeembed/evaluation.py +++ b/scripts/shapeembed/evaluation.py @@ -303,7 +303,8 @@ def save_barplot( scores_df ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') plt.savefig(f"{outputdir}/barplot_{m}_x_bs_cf{cf}.pdf") plt.close() - ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == cf) + for bs in melted_df['batch_size'].unique(): + ax = seaborn.catplot( data=melted_df.loc[ (melted_df['model'] == m) & (melted_df['batch_size'] == bs) , ['compression_factor', 'beta', 'Metric', 'Score'] ] , kind="bar" , x='compression_factor' @@ -314,7 +315,7 @@ def save_barplot( scores_df , aspect=width * 2**0.5 / height ) ax.tick_params(axis='x', rotation=90) ax.fig.subplots_adjust(top=0.9) - ax.set(title=f'f1 score against batch size ({m}, compression factor {cf})') + ax.set(title=f'f1 score against compression factor ({m}, compression batch size {bs})') plt.savefig(f"{outputdir}/barplot_{m}_x_cf_bs{bs}.pdf") plt.close() # log info From 932b13a7bcc926cdde63168783ec45afe91952cb Mon 
Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Jul 2024 21:13:43 +0100 Subject: [PATCH 191/204] fake beta column if necessary and filter out regionprops and efd for f1 Vs Mse comparison --- scripts/shapeembed/gather_run_results.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 3b5c7e32..b58e5fd1 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -116,6 +116,10 @@ def main_process(clargs, logger=logging.getLogger(__name__)): else: df['umap'] = f'nofile' + # NA desired columns if not already present + if 'beta' not in df.keys(): + df['beta'] = pd.NA + ## pair up with barplot #barplot = f'scores_barplot.pdf' #if os.path.isfile(f'{d}/{barplot}'): @@ -166,8 +170,13 @@ def keep_first_fname(series): # table results for f1 and mse comparison simple_table(df, f'{clargs.output_dir}/table_top40_f1', sort_by_col='test_f1') simple_table(df, f'{clargs.output_dir}/table_top40_mse', sort_by_col='mse/test', ascending=True) - compare_f1_mse_table(df, f'{clargs.output_dir}/table_top5_compare', best_n=5) - trial_table(df, f'{clargs.output_dir}/trials') + # temporarily drop regionprops and efd rows for F1 and MSE comparison + dff = df[(df['trial'] != 'regionprops') & (df['trial'] != 'efd')] + compare_f1_mse_table(dff, f'{clargs.output_dir}/table_top5_compare', best_n=5) + if 'regionprops' in df['trial'].values and 'efd' in df['trial'].values: + trial_table(df, f'{clargs.output_dir}/trials') + else: + logger.info('skipped trial table comparison (need both regionprops and efd results)') # mse / f1 plots dff=df[df['mse/test'] Date: Mon, 29 Jul 2024 21:24:09 +0100 Subject: [PATCH 192/204] updated datasets + only find jobs and scores if corresponding filter active --- scripts/shapeembed/slurm_sweep_shapeembed.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 7d4aa40c..1080394c 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -20,8 +20,10 @@ datasets = [ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") - ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") -, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") +# ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") +# ("vampire_cells", f"{datasets_pfx}/vampire_cells/", "mask") + ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") +#, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") #, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") @@ -202,14 +204,13 @@ def spawn_slurm_job(slurm_out_dir, out_dir, ps, logger=logging.getLogger(__name_ os.makedirs(clargs.slurm_output_dir, exist_ok=True) os.makedirs(clargs.output_dir, exist_ok=True) - done_params = find_existing_run_scores(clargs.output_dir) - in_slurm_params = find_submitted_slurm_jobs() - all_params = gen_params_sweep_list() + todo_params = gen_params_sweep_list() - todo_params = all_params if clargs.filter_done: + done_params = find_existing_run_scores(clargs.output_dir) todo_params = [x for x in todo_params if not 
params_match(x, done_params)] if clargs.filter_submitted: + in_slurm_params = find_submitted_slurm_jobs() todo_params = [x for x in todo_params if not params_match(x, in_slurm_params)] for ps in todo_params:
From 583e835fcdf3ae637df792f2a7ac531201ec358d Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 29 Jul 2024 21:54:19 +0100 Subject: [PATCH 193/204] bugfix overwriting loop dataframe --- scripts/shapeembed/gather_run_results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index b58e5fd1..478a4c73 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -187,15 +187,15 @@ def keep_first_fname(series): dff = df.dropna(subset=['model']) for m in dff['model'].unique(): - dff = dff[dff['model']==m] + local_df = dff[dff['model']==m] print(m) - ax = seaborn.relplot(kind='line', data=dff.dropna(subset=['test_f1']), x='compression_factor', y='test_f1', hue='batch_size') + ax = seaborn.relplot(kind='line', data=local_df.dropna(subset=['test_f1']), x='compression_factor', y='test_f1', hue='batch_size') ax.figure.suptitle(f'{m}: f1 VS compression factor') ax.figure.savefig(f'{clargs.output_dir}/{m}_f1VScompression_factor_line.png') - ax = seaborn.relplot(kind='line', data=dff.dropna(subset=['mse/test']), x='compression_factor', y='mse/test', hue='batch_size') + ax = seaborn.relplot(kind='line', data=local_df.dropna(subset=['mse/test']), x='compression_factor', y='mse/test', hue='batch_size') ax.figure.suptitle(f'{m}: Mse VS compression factor') ax.figure.savefig(f'{clargs.output_dir}/{m}_mseVScompression_factor_line.png') - simple_table(dff, f'{clargs.output_dir}/{m}_summary_table') + simple_table(local_df, f'{clargs.output_dir}/{m}_summary_table') #cell_hover = { # for row hover use instead of # 'selector': 'td:hover',
From 07bb89c57ff4a266b0a67b229bdf025c3fb6ea8f Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Tue, 30 Jul 2024 10:41:50 +0100 Subject: [PATCH 194/204] Added a clarg to control regionprops properties --- scripts/shapeembed/regionprops.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index db37ac25..07431864 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -24,7 +24,7 @@ def run_regionprops( dataset , logger ): # run regionprops for the given properties for each image dfs = [] - logger.info(f'running regionprops on {dataset}') + logger.info(f'running regionprops on {dataset}, properties: {properties}') for i, (img, lbl) in enumerate(tqdm.tqdm(dataset)): data = numpy.where(numpy.array(img)>20, 255, 0) t = measure.regionprops_table(data, properties=properties) df = pandas.DataFrame(t) @@ -53,6 +53,10 @@ def run_regionprops( dataset , "minor_axis_length" , "orientation" ] + parser.add_argument( + '-p', '--properties', metavar='PROP', default=dflt_properties, nargs='+' + , help=f"Overwrite the list of properties to consider (default: {dflt_properties})") + parser.add_argument( '-o', '--output-dir', metavar='OUTPUT_DIR', default='./' , help=f"The OUTPUT_DIR path to use to dump results") @@ -74,7 +78,7 @@ def run_regionprops( dataset dataset = types.SimpleNamespace( name=clargs.dataset[0] , path=clargs.dataset[1] , type=clargs.dataset[2] ) - properties = dflt_properties + properties = clargs.properties # create output dir if it does not exist os.makedirs(clargs.output_dir, exist_ok=True) From
a35490c65e45a7fdd41dbc35593b04d5a76df054 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Thu, 8 Aug 2024 11:25:11 +0100 Subject: [PATCH 195/204] Added random order to efd and regionprops --- scripts/shapeembed/efd.py | 14 +++++++------- scripts/shapeembed/regionprops.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/shapeembed/efd.py b/scripts/shapeembed/efd.py index 28e8aa0b..9b9525f8 100755 --- a/scripts/shapeembed/efd.py +++ b/scripts/shapeembed/efd.py @@ -3,6 +3,7 @@ import os import types import pyefd +import random import logging import argparse @@ -14,14 +15,13 @@ def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - dataset = datasets.ImageFolder( dataset_params.path - , transform=transforms.Compose([ - transforms.Grayscale(1) - , ImageToCoords(contour_size) ])) + raw_dataset = datasets.ImageFolder( dataset_params.path + , transform=transforms.Compose([ + transforms.Grayscale(1) + , ImageToCoords(contour_size) ])) + dataset = [x for x in raw_dataset] + random.shuffle(dataset) return dataset - #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) - #dataloader.setup() - #return dataloader.test def run_elliptic_fourier_descriptors(dataset, contour_size, logger): # run efd on each image diff --git a/scripts/shapeembed/regionprops.py b/scripts/shapeembed/regionprops.py index 07431864..a2325c86 100755 --- a/scripts/shapeembed/regionprops.py +++ b/scripts/shapeembed/regionprops.py @@ -2,6 +2,7 @@ import os import types +import random import logging import argparse from skimage import measure @@ -13,11 +14,10 @@ def get_dataset(dataset_params): # access the dataset assert dataset_params.type == 'mask', f'unsupported dataset type {dataset_params.type}' - dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + raw_dataset = datasets.ImageFolder(dataset_params.path, transforms.Grayscale(1)) + dataset = [x for x in raw_dataset] + random.shuffle(dataset) return dataset - #dataloader = bioimage_embed.lightning.DataModule(dataset, shuffle=True) - #dataloader.setup() - #return dataloader.test def run_regionprops( dataset , properties From 83a7679bb893928be9d46e5eb5a60ac38edfb4ce Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sat, 7 Sep 2024 20:35:52 +0100 Subject: [PATCH 196/204] force different markers for scatter plot F1vMSE --- scripts/shapeembed/gather_run_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 478a4c73..1d2ca37e 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -182,7 +182,7 @@ def keep_first_fname(series): dff=df[df['mse/test'] Date: Mon, 9 Sep 2024 18:35:27 +0100 Subject: [PATCH 197/204] updated scatterplot --- scripts/shapeembed/gather_run_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 1d2ca37e..35098ebd 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -182,7 +182,9 @@ def keep_first_fname(series): dff=df[df['mse/test'] Date: Fri, 27 Sep 2024 13:42:39 +0100 Subject: [PATCH 198/204] add standard deviation to the report for regions props and efd --- scripts/shapeembed/gather_run_results.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/scripts/shapeembed/gather_run_results.py b/scripts/shapeembed/gather_run_results.py index 35098ebd..b14ffb58 100755 --- a/scripts/shapeembed/gather_run_results.py +++ b/scripts/shapeembed/gather_run_results.py @@ -26,7 +26,7 @@ def trial_table(df, tname): #def simple_table(df, tname, model_re=".*vq.*"): def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, best_n=40): - cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'mse/test'] + cols=['model', 'compression_factor', 'latent_dim', 'batch_size', 'beta', 'test_f1', 'test_f1_std', 'mse/test'] df = df.loc[df.model.str.contains(model_re), cols].sort_values(by=cols) if sort_by_col: df = df.sort_values(by=sort_by_col, ascending=ascending) @@ -34,11 +34,11 @@ def simple_table(df, tname, model_re=".*", sort_by_col=None, ascending=False, be with open(f'{tname}_tabular.tex', 'w') as fp: fp.write("\\begin{tabular}{|llll|r|r|} \hline\n") - fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & Mse \\\\ \hline\n") + fp.write("Model & CF (and latent space size) & batch size & BETA & F1 score & F1 score (std) & Mse \\\\ \hline\n") for _, r in df.iterrows(): mname = r['model'].replace('_','\_') beta = '-' if pd.isna(r['beta']) else r['beta'] - fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['mse/test']:f} \\\\\n") + fp.write(f"{mname} & {r['compression_factor']} ({r['latent_dim']}) & {r['batch_size']} & {beta} & {r['test_f1']:f} & {r['test_f1_std']:f} & {r['mse/test']:f} \\\\\n") fp.write("\hline\n") fp.write("\end{tabular}\n") @@ -146,12 +146,14 @@ def keep_first_fname(series): df.set_index(idx_cols, inplace=True) df.sort_index(inplace=True) #df = df.groupby(level=['trial', 'dataset', 'model', 'compression_factor', 'latent_dim', 'batch_size']).agg({ + df['test_f1_std'] = df['test_f1'].astype(float) df = df.groupby(level=idx_cols, dropna=False).agg({ 'beta': 'mean' , 'test_accuracy': 'mean' , 'test_precision': 'mean' , 'test_recall': 'mean' , 'test_f1': 'mean' + , 'test_f1_std': 'std' , 'mse/test': 'mean' , 'loss/test': 'mean' , 'mse/val': 'mean' From c0232c7c36bd883941cc609a5e795cea894267b5 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Fri, 27 Sep 2024 13:48:29 +0100 Subject: [PATCH 199/204] modification slurm script --- scripts/shapeembed/slurm_sweep_shapeembed.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 1080394c..7a94fda2 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -21,8 +21,8 @@ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") # ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") -# ("vampire_cells", f"{datasets_pfx}/vampire_cells/", "mask") - ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") + ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") +# ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") #, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") #, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") #, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") @@ -32,11 +32,11 @@ models = [ "resnet18_vqvae" -, "resnet50_vqvae" -, "resnet18_vae" -, "resnet50_vae" +#, "resnet50_vqvae" +#, 
"resnet18_vae" +#, "resnet50_vae" , "resnet18_beta_vae" -, "resnet50_beta_vae" +#, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" #, "resnet18_vqvae_legacy" @@ -49,8 +49,10 @@ ] model_params = { - "resnet18_beta_vae": {'beta': [2,5]} -, "resnet50_beta_vae": {'beta': [2,5]} + #"resnet18_beta_vae": {'beta': [2,5]} + "resnet18_beta_vae": {'beta': [0.0001]} +#, "resnet50_beta_vae": {'beta': [2,5]} +, "resnet50_beta_vae": {'beta': [0.00001]} } compression_factors = [1,2,3,5,10] @@ -119,7 +121,7 @@ def find_submitted_slurm_jobs(): dflt_out_dir=f'{os.getcwd()}/output_results_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}' slurm_time = '50:00:00' -slurm_mem = '250G' +slurm_mem = '80G' slurm_gpus = 'a100:1' shapeembed_script=f'{os.getcwd()}/shapeembed.py' From 870aa4211d522a7069b4b1ed0963026fa5ed6cde Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 29 Sep 2024 18:26:17 +0100 Subject: [PATCH 200/204] changes to test o2vae integration XXX relies on an adapted o2vae repo present in bioimage_embed/modles/o2vae --- bioimage_embed/models/factory.py | 67 ++++++++++++++++++- scripts/shapeembed/dataset_transformations.py | 66 ++++++++++++++++++ scripts/shapeembed/shapeembed.py | 25 ++++++- 3 files changed, 154 insertions(+), 4 deletions(-) diff --git a/bioimage_embed/models/factory.py b/bioimage_embed/models/factory.py index 8c6440d5..4c5f1a21 100644 --- a/bioimage_embed/models/factory.py +++ b/bioimage_embed/models/factory.py @@ -18,7 +18,6 @@ from . import bolts from functools import partial - class ModelFactory: def __init__( self, input_dim, latent_dim, pretrained=False, progress=True, **kwargs @@ -200,6 +199,71 @@ def resnet110_vqvae_legacy(self): def resnet152_vqvae_legacy(self): return self.resnet_vqvae_legacy(152) + def o2vae(self): + from .o2vae.models.decoders.cnn_decoder import CnnDecoder + from .o2vae.models.encoders_o2.e2scnn import E2SFCNN + from .o2vae.models.vae import VAE as O2VAE + + # encoder + q_net = E2SFCNN( + n_channels = 1, + n_classes = 64 * 2, # bc vae saves mean and stdDev vecors + # `name`: 'o2_cnn' for o2-invariant encoder. 'cnn_encoder' for standard cnn encoder. + name="o2_cnn_encoder", + # `cnn_dims`: must be 6 elements long. Increase numbers for larger model capacity + cnn_dims=[6, 9, 12, 12, 19, 25], + # `layer_type`: type of cnn layer (following e2cnn library examples) + layer_type="inducedgated_norm", # recommend not changing + # `N`: Ignored if `name!='o2'`. Negative means the model will be O2-invariant. + # Again, see (e2cnn library examples). Recommend not changing. + N=-3, + ) + + # decoder + p_net = CnnDecoder( + zdim = 64, + name="cnn_decoder", # 'cnn' is the ony option + # `cnn_dims`: each extra layer doubles the dimension (image width) by a factor of 2. + # E.g. if there are 6 elements, image width is 2^6=64 + cnn_dims=[192, 96, 96, 48, 48, 48], + #cnn_dims=[192, 96, 96, 48, 48, 24, 24, 12, 12], + out_channels=1, + ) + + # vae + model = O2VAE( + q_net = q_net, + p_net = p_net, + zdim = 64, # vae bottleneck layer + do_sigmoid = True, # whether to make the output be between [0,1]. Usually True. + loss_kwargs = dict( + # 'beta' from beta-vae, or the weight on the KL-divergence term https://openreview.net/forum?id=Sy2fzU9gl + beta=0.01, + # `recon_loss_type`: "bce" (binary cross entropy) or "mse" (mean square error) + # or "ce" (cross-entropy, but warning, not been tested well) + #recon_loss_type="bce", + recon_loss_type="mse", + # for reconstrutcion loss, pixel mask. Must be either `None` or an array with same dimension as the images. 
+ mask=None, + align_loss=True, # whether to align the output image to the input image + # whether to use efficient Foureier-based loss alignment. (Ignored if align_loss==False) + align_fourier=True, + # whether to do align the best rotation AND flip, instead of just rotation. (Ignored if align_loss==False) + do_flip=True, + # if doing brute force align loss, this is the rotation discretization. (Ignored if + # align_loss==False or if align_fourier==True) + rot_steps=2, + # Recommend not changing. The vae prior distribution. Optoins: ("standard","normal","gmm"). See models.vae.VAE for deatils. + prior_kwargs=dict( prior="standard",), + ) + ) + + # extra attributes + model.encoder = q_net + model.decoder = p_net + + return model + MODELS = [ "resnet18_vae", @@ -217,6 +281,7 @@ def resnet152_vqvae_legacy(self): "resnet152_vqvae_legacy", "resnet18_vae_legacy", "resnet50_vae_legacy", + "o2vae", ] from typing import Tuple diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py index 1cd76c7f..ad3789c7 100644 --- a/scripts/shapeembed/dataset_transformations.py +++ b/scripts/shapeembed/dataset_transformations.py @@ -146,3 +146,69 @@ def mask2distmatrix(mask, matrix_size=512, raw_sampling_sparsity=1): dm = build_distance_matrix(x_reinterpolated, y_reinterpolated) logger.debug(f'mask2distmatrix: created distance matrix shape {dm.shape}') return dm + +def bbox(img): + """ + This function returns the bounding box of the content of an image, where + "content" is any non 0-valued pixel. The bounding box is returned as the + quadruple ymin, ymax, xmin, xmax. + + Parameters + ---------- + img : 2-d numpy array + An image with an object to find the bounding box for. The truth value of + object pixels should be True and of non-object pixels should be False. + + Returns + ------- + ymin: int + The lowest index row containing object pixels + ymax: int + The highest index row containing object pixels + xmin: int + The lowest index column containing object pixels + xmax: int + The highest index column containing object pixels + """ + rows = np.any(img, axis=1) + cols = np.any(img, axis=0) + ymin, ymax = np.where(rows)[0][[0, -1]] + xmin, xmax = np.where(cols)[0][[0, -1]] + return ymin, ymax, xmin, xmax + +def recrop_image(img, square=False): + """ + This function returns an image recroped to its content. + + Parameters + ---------- + img : 3-d numpy array + A 3-channels (rgb) 2-d image with an object to recrop around. The value of + object pixels should be non-zero (and zero for non-object pixels). 
+ + Returns + ------- + 3-d numpy array + The recroped image + """ + + ymin, ymax, xmin, xmax = bbox(img) + newimg = img[ymin:ymax+1, xmin:xmax+1] + + if square: # slot the new image into a black square + dx, dy = xmax - xmin + 1, ymax - ymin + 1 + dmax = max(dx, dy) + dmin = min(dx, dy) + dd = max(dx, dy) - min(dx, dy) + off = dd // 2 + res = np.full((dmax, dmax, 3), [.0,.0,.0]) # big black square + if dx < dy: # fewer columns, center horizontally + res[:, off+1:off+1+newimg.shape[1]] = newimg + else: # fewer lines, center vertically + #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") + #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") + #print(f"DEBUG: newimg.shape: {newimg.shape}") + res[off+1:off+1+newimg.shape[0],:] = newimg + return res + else: + return newimg diff --git a/scripts/shapeembed/shapeembed.py b/scripts/shapeembed/shapeembed.py index e62dbd59..9f18a7c0 100755 --- a/scripts/shapeembed/shapeembed.py +++ b/scripts/shapeembed/shapeembed.py @@ -60,6 +60,7 @@ , "resnet152_vqvae_legacy" , "resnet18_vae_legacy" , "resnet50_vae_legacy" +, "o2vae" ] # set of parameters for a run, with default values @@ -165,12 +166,30 @@ def get_dataloader(params): if params.dataset.type == 'raw_image': # TODO raise NotImplementedError("raw images not yet supported") elif params.dataset.type == 'mask': # mask data, convert to distance matrix first + #dataset = datasets.ImageFolder( + # params.dataset.path + #, transforms.Compose([ np.array + # , functools.partial( mask2distmatrix + # , matrix_size=params.distance_matrix_size ) + # , distmat_ts ])) + def f(x): + print(f"DEBUG: shape:{x.shape}") + return x + def g(x): + print(f"-------------") + return x dataset = datasets.ImageFolder( params.dataset.path , transforms.Compose([ np.array - , functools.partial( mask2distmatrix - , matrix_size=params.distance_matrix_size ) - , distmat_ts ])) + , functools.partial(recrop_image, square=True) + , torch.as_tensor + , lambda x: torch.transpose(x, 0, 2) + , transforms.Resize(64) + , lambda x: torch.transpose(x, 0, 2) + , rgb2grey + #, lambda x: x.repeat(3, 1, 1) + , lambda x: x.repeat(1, 1, 1) + ])) elif params.dataset.type == 'distance_matrix': # distance matrix data dataset = datasets.DatasetFolder( params.dataset.path , loader=np.load From 6fa5cc903f518e2bd8a766734bf332416ec84448 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Sun, 29 Sep 2024 22:28:50 +0100 Subject: [PATCH 201/204] off-by-one in square recrop --- scripts/shapeembed/dataset_transformations.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/shapeembed/dataset_transformations.py b/scripts/shapeembed/dataset_transformations.py index ad3789c7..8c4c6693 100644 --- a/scripts/shapeembed/dataset_transformations.py +++ b/scripts/shapeembed/dataset_transformations.py @@ -196,19 +196,21 @@ def recrop_image(img, square=False): newimg = img[ymin:ymax+1, xmin:xmax+1] if square: # slot the new image into a black square - dx, dy = xmax - xmin + 1, ymax - ymin + 1 + dx, dy = xmax+1 - xmin, ymax+1 - ymin dmax = max(dx, dy) - dmin = min(dx, dy) + #dmin = min(dx, dy) dd = max(dx, dy) - min(dx, dy) off = dd // 2 res = np.full((dmax, dmax, 3), [.0,.0,.0]) # big black square + #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") + #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") + #print(f"DEBUG: newimg.shape: {newimg.shape}") if dx < dy: # fewer columns, center horizontally - res[:, 
off+1:off+1+newimg.shape[1]] = newimg + res[:, off:off+newimg.shape[1]] = newimg else: # fewer lines, center vertically - #print(f"DEBUG: dx {dx}, dy {dy}, dmax {dmax}, dd {dd}, off {off}") - #print(f"DEBUG: res[off+1:off+1+newimg.shape[0],:].shape: {res[off+1:off+1+newimg.shape[0],:].shape}") - #print(f"DEBUG: newimg.shape: {newimg.shape}") - res[off+1:off+1+newimg.shape[0],:] = newimg + res[off:off+newimg.shape[0],:] = newimg + #print(f"DEBUG: res img updated") + #print(f"DEBUG: ------------------------------") return res else: return newimg From 85ce853ad6fb2a47ee17246716cabb3b4f3c1958 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 08:26:40 +0100 Subject: [PATCH 202/204] added drop_last for uneven dataset sizes --- bioimage_embed/lightning/dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bioimage_embed/lightning/dataloader.py b/bioimage_embed/lightning/dataloader.py index 29f608a4..34b84097 100644 --- a/bioimage_embed/lightning/dataloader.py +++ b/bioimage_embed/lightning/dataloader.py @@ -35,6 +35,7 @@ def __init__( "pin_memory": True, "shuffle": False, "sampler": sampler, + "drop_last": True, # "collate_fn": self.collate_wrapper(self.collate_filter_for_none), # "collate_fn": self.collate_filter_for_none, } From db3a4fb975f6c0a4cc33b31c3ecbd50f08336345 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 17:20:51 +0100 Subject: [PATCH 203/204] specialized slurm script --- scripts/shapeembed/slurm_sweep_shapeembed.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/scripts/shapeembed/slurm_sweep_shapeembed.py b/scripts/shapeembed/slurm_sweep_shapeembed.py index 7a94fda2..d04a5d5f 100755 --- a/scripts/shapeembed/slurm_sweep_shapeembed.py +++ b/scripts/shapeembed/slurm_sweep_shapeembed.py @@ -21,21 +21,22 @@ # ("synthetic_shapes", f"{datasets_pfx}/synthetic_shapes/", "mask") # ("tiny_synthcell", f"{datasets_pfx}/tiny_synthcellshapes_dataset/", "mask") # ("vampire", f"{datasets_pfx}/vampire/torchvision/Control/", "mask") - ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") -# ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") -#, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") -#, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") -#, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") -#, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") -#, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") + ("mefs_cells", f"{datasets_pfx}/mefs_single_object_cell/", "mask") +, ("vampire_nuclei", f"{datasets_pfx}/vampire_nuclei/", "mask") +, ("binary_vampire", f"{datasets_pfx}/binary_vampire/", "mask") +, ("bbbc010", f"{datasets_pfx}/bbbc010/BBBC010_v1_foreground_eachworm/", "mask") +, ("synthcell", f"{datasets_pfx}/synthcellshapes_dataset/", "mask") +, ("helakyoto", f"{datasets_pfx}/H2b_10x_MD_exp665/samples/", "mask") +, ("allen", f"{datasets_pfx}/allen_dataset/", "mask") ] models = [ - "resnet18_vqvae" + "o2vae" +# "resnet18_vqvae" #, "resnet50_vqvae" #, "resnet18_vae" #, "resnet50_vae" -, "resnet18_beta_vae" +#, "resnet18_beta_vae" #, "resnet50_beta_vae" #, "resnet18_vae_bolt" #, "resnet50_vae_bolt" @@ -50,14 +51,15 @@ model_params = { #"resnet18_beta_vae": {'beta': [2,5]} - "resnet18_beta_vae": {'beta': [0.0001]} +# "resnet18_beta_vae": {'beta': [0.0001]} #, "resnet50_beta_vae": {'beta': [2,5]} -, "resnet50_beta_vae": {'beta': [0.00001]} +#, "resnet50_beta_vae": {'beta': [0.00001]} } -compression_factors = 
[1,2,3,5,10] +#compression_factors = [1,2,3,5,10] +compression_factors = [1] -batch_sizes = [4, 8, 16] +batch_sizes = [4, 16, 64, 128, 256] # XXX XXX XXX XXX XXX XXX XXX # # XXX ad-hoc one-off config XXX # From c45b884273216eef610fd8b2dca4094dae5bb176 Mon Sep 17 00:00:00 2001 From: Anna Foix Date: Mon, 30 Sep 2024 17:21:56 +0100 Subject: [PATCH 204/204] added o2vae repo patch --- .../models/o2vae_shapeembed_integration.diff | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 bioimage_embed/models/o2vae_shapeembed_integration.diff diff --git a/bioimage_embed/models/o2vae_shapeembed_integration.diff b/bioimage_embed/models/o2vae_shapeembed_integration.diff new file mode 100644 index 00000000..309d7206 --- /dev/null +++ b/bioimage_embed/models/o2vae_shapeembed_integration.diff @@ -0,0 +1,97 @@ +diff --git a/models/align_reconstructions.py b/models/align_reconstructions.py +index d07d1ab..c52b40d 100644 +--- a/models/align_reconstructions.py ++++ b/models/align_reconstructions.py +@@ -6,7 +6,7 @@ import torch + import torchgeometry as tgm + import torchvision.transforms.functional as T_f + +-from registration import registration ++from ..registration import registration + + + def loss_reconstruction_fourier_batch(x, y, recon_loss_type="bce", mask=None): +diff --git a/models/decoders/cnn_decoder.py b/models/decoders/cnn_decoder.py +index ba3a1cc..1740945 100644 +--- a/models/decoders/cnn_decoder.py ++++ b/models/decoders/cnn_decoder.py +@@ -58,7 +58,7 @@ class CnnDecoder(nn.Module): + + self.dec_conv = nn.Sequential(*layers) + +- def forward(self, x): ++ def forward(self, x, epoch = None): + bs = x.size(0) + x = self.fc(x) + dim = x.size(1) +diff --git a/models/encoders_o2/e2scnn.py b/models/encoders_o2/e2scnn.py +index 9c4f47f..e292b1e 100644 +--- a/models/encoders_o2/e2scnn.py ++++ b/models/encoders_o2/e2scnn.py +@@ -219,14 +219,20 @@ class E2SFCNN(torch.nn.Module): + repr += f"\t{i: <3} - {name: <70} | {params: <8} |\n" + return repr + +- def forward(self, input: torch.tensor): ++ def forward(self, input: torch.tensor, epoch = None): ++ #print(f"DEBUG: e2scnn forward: input.shape: {input.shape}") + x = GeometricTensor(input, self.in_repr) ++ #print(f"DEBUG: e2scnn forward: pre layers x.shape: {x.shape}") + + for layer in self.eq_layers: + x = layer(x) + ++ #print(f"DEBUG: e2scnn forward: pre fully_net x.shape: {x.shape}") ++ + x = self.fully_net(x.tensor.reshape(x.tensor.shape[0], -1)) + ++ #print(f"DEBUG: e2scnn forward: pre final x.shape: {x.shape}") ++ + return x + + def build_layer_regular( +diff --git a/models/vae.py b/models/vae.py +index 3af262b..af1a2dc 100644 +--- a/models/vae.py ++++ b/models/vae.py +@@ -3,8 +3,9 @@ import importlib + import numpy as np + import torch + import torchvision ++from pythae.models.base.base_utils import ModelOutput + +-from models import align_reconstructions ++from . import align_reconstructions + + from . import model_utils as mut + +@@ -273,10 +274,11 @@ class VAE(torch.nn.Module): + + return y + +- def forward(self, x): ++ def forward(self, x, epoch = None): ++ x = x["data"] + in_shape = x.shape + bs = in_shape[0] +- assert x.ndim == 4 ++ assert len(in_shape) == 4 + + # inference and sample + z = self.q_net(x) +@@ -290,8 +292,12 @@ class VAE(torch.nn.Module): + y = torch.sigmoid(y) + # check the spatial dimensions are good (if doing multiclass prediction per pixel, the `c` dim may be different) + assert in_shape[-2:] == y.shape[-2:], ( +- "output image different dimension to " +- "input image ... 
probably change the number of layers (cnn_dims) in the decoder" ++ f"output image different dimension {y.shape[-2:]} to " ++ f"input image {in_shape[-2:]} ... probably change the number of layers (cnn_dims) in the decoder" + ) + +- return x, y, mu, logvar ++ # gather losses ++ losses = self.loss(x, y, mu, logvar) ++ ++ return ModelOutput(recon_x=y, z=z_sample, loss=losses['loss'], recon_loss=losses['loss_recon']) ++ #return ModelOutput(recon_x=y, z=z_sample)
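
A note on PATCH 201/204 ("off-by-one in square recrop"), for readers skimming the diff: with the inclusive bounding box used by recrop_image, starting the paste at off+1 biases the crop towards one edge of the square, and when the crop is already square (dx == dy) the slice off+1:off+1+shape is one row short, so the assignment fails. Starting at off centres the crop and always fits. The following self-contained sketch (not part of the patch series) shows the corrected centring; recrop_to_square is a hypothetical helper standing in for bbox plus recrop_image, and it assumes background pixels are exactly zero.

import numpy as np

def recrop_to_square(img: np.ndarray) -> np.ndarray:
    # Crop an (H, W, 3) image to its non-zero bounding box, then centre it on a black square.
    ys, xs = np.nonzero(img.any(axis=-1))    # foreground pixels; background assumed to be 0
    crop = img[ys.min():ys.max() + 1, xs.min():xs.max() + 1]
    dy, dx = crop.shape[:2]
    side = max(dy, dx)
    off = (side - min(dy, dx)) // 2          # symmetric padding, as in the patched code
    res = np.zeros((side, side, 3), dtype=img.dtype)
    if dx < dy:                              # fewer columns: centre horizontally
        res[:, off:off + dx] = crop          # start at off, not off + 1
    else:                                    # fewer (or equal) rows: centre vertically
        res[off:off + dy, :] = crop
    return res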
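
On PATCH 202/204: when the dataset size is not a multiple of the batch size, the last batch comes out smaller than the rest, and some training-mode layers (for instance BatchNorm when the leftover batch holds a single sample) or fixed-shape code paths will not accept it; drop_last=True simply discards that short final batch. A small illustration, independent of the bioimage_embed dataloader (the toy dataset below is made up):

import torch
from torch.utils.data import DataLoader, TensorDataset

ds = TensorDataset(torch.randn(10, 3))   # 10 samples with batch_size=4 -> batches of 4, 4, 2
print([x.shape[0] for (x,) in DataLoader(ds, batch_size=4)])                  # [4, 4, 2]
print([x.shape[0] for (x,) in DataLoader(ds, batch_size=4, drop_last=True)])  # [4, 4]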
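
PATCH 204/204 ships the o2vae integration as a patch file against the external o2vae repository rather than as code in this tree: it converts o2vae's absolute imports to package-relative ones, lets the encoder and decoder forward methods accept an optional epoch argument, and rewrites VAE.forward so that it unpacks the {"data": ...} batches used by the pythae/bioimage_embed training loop and returns a pythae ModelOutput carrying the reconstruction, the latent sample and the losses. A stripped-down adapter in the same spirit is sketched below; PythaeForwardAdapter and the (recon, z, losses) interface of the wrapped model are illustrative assumptions, not the actual integration.

import torch.nn as nn
from pythae.models.base.base_utils import ModelOutput

class PythaeForwardAdapter(nn.Module):
    # Wraps a model whose forward takes a plain tensor and returns (recon, z, loss dict)
    # so that it can be driven with pythae-style {"data": tensor} batches.
    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, batch, epoch=None):
        x = batch["data"]                    # pythae hands the model a dict-like batch
        recon, z, losses = self.inner(x)     # assumed interface of the wrapped model
        return ModelOutput(recon_x=recon, z=z,
                           loss=losses["loss"], recon_loss=losses["loss_recon"])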