From ee8d0b20f391b235cf9cfe70007c19c731490a61 Mon Sep 17 00:00:00 2001
From: Landan Seguin
Date: Mon, 21 Aug 2023 13:52:19 -0700
Subject: [PATCH] Clean up yamls (#58)

* Clean up yamls
---
 yamls/hydra-yamls/SD-2-base-256.yaml  | 16 ++++---------
 yamls/hydra-yamls/SD-2-base-512.yaml  | 16 ++++---------
 yamls/mosaic-yamls/SD-2-base-256.yaml | 34 ++++++++++++++-------------
 yamls/mosaic-yamls/SD-2-base-512.yaml | 27 ++++++++++-----------
 4 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/yamls/hydra-yamls/SD-2-base-256.yaml b/yamls/hydra-yamls/SD-2-base-256.yaml
index b466fa98..d0ad1470 100644
--- a/yamls/hydra-yamls/SD-2-base-256.yaml
+++ b/yamls/hydra-yamls/SD-2-base-256.yaml
@@ -1,8 +1,6 @@
 project: # Insert wandb project name
-batch_size: 2048
-seed: 17
-scale_schedule_ratio: 1.0
 name: # Insert wandb run name
+seed: 17
 eval_first: false
 algorithms:
   low_precision_groupnorm:
@@ -22,15 +20,14 @@ model:
   val_guidance_scales: []
   loss_bins: []
 dataset:
-  train_batch_size: ${batch_size}
-  eval_batch_size: 1024 # Should be 8 per device
+  train_batch_size: 2048 # Global training batch size
+  eval_batch_size: 1024 # Global evaluation batch size
   train_dataset:
     _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
     remote: # Path to object store bucket(s)
     local: # Path to corresponding local dataset(s)
-    batch_size: ${batch_size}
     tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
     caption_drop_prob: 0.1
     resize_size: 256
@@ -46,7 +43,6 @@ dataset:
     _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
     remote: # Path to object store bucket
     local: # Path to local dataset cache
-    batch_size: 8
     resize_size: 256
     prefetch_factor: 2
     num_workers: 8
@@ -57,10 +53,9 @@ optimizer:
   lr: 1.0e-4
   weight_decay: 0.01
 scheduler:
-  _target_: composer.optim.MultiStepWithWarmupScheduler
+  _target_: composer.optim.LinearWithWarmupScheduler
   t_warmup: 10000ba
-  milestones:
-  - 200ep
+  alpha_f: 1.0
 logger:
   wandb:
     _target_: composer.loggers.wandb_logger.WandBLogger
@@ -87,7 +82,6 @@ trainer:
   device_train_microbatch_size: 16
   run_name: ${name}
   seed: ${seed}
-  scale_schedule_ratio: ${scale_schedule_ratio}
   save_folder: # Insert path to save folder or bucket
   save_interval: 10000ba
   save_overwrite: true
diff --git a/yamls/hydra-yamls/SD-2-base-512.yaml b/yamls/hydra-yamls/SD-2-base-512.yaml
index ab254624..39a1e659 100644
--- a/yamls/hydra-yamls/SD-2-base-512.yaml
+++ b/yamls/hydra-yamls/SD-2-base-512.yaml
@@ -1,8 +1,6 @@
 project: # Insert wandb project name
-batch_size: 2048
-seed: 17
-scale_schedule_ratio: 1.0
 name: # Insert wandb run name
+seed: 17
 eval_first: false
 algorithms:
   ema:
@@ -28,15 +26,14 @@ model:
   val_guidance_scales: []
   loss_bins: []
 dataset:
-  train_batch_size: ${batch_size}
-  eval_batch_size: 1024 # Should be 8 per device
+  train_batch_size: 2048 # Global training batch size
+  eval_batch_size: 1024 # Global evaluation batch size
   train_dataset:
     _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
     remote: # Path to object store bucket(s)
     local: # Path to corresponding local dataset(s)
-    batch_size: ${batch_size}
     tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
     caption_drop_prob: 0.1
     resize_size: 512
@@ -52,7 +49,6 @@ dataset:
     _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
     remote: # Path to object store bucket
     local: # Path to local dataset cache
-    batch_size: 8
     resize_size: 512
     prefetch_factor: 2
     num_workers: 8
@@ -63,10 +59,9 @@ optimizer:
   lr: 1.0e-4
   weight_decay: 0.01
 scheduler:
-  _target_: composer.optim.MultiStepWithWarmupScheduler
+  _target_: composer.optim.LinearWithWarmupScheduler
   t_warmup: 10000ba
-  milestones:
-  - 200ep
+  alpha_f: 1.0
 logger:
   wandb:
     _target_: composer.loggers.wandb_logger.WandBLogger
@@ -93,7 +88,6 @@ trainer:
   device_train_microbatch_size: 16
   run_name: ${name}
   seed: ${seed}
-  scale_schedule_ratio: ${scale_schedule_ratio}
   save_folder: # Insert path to save folder or bucket
   save_interval: 10000ba
   save_overwrite: true
diff --git a/yamls/mosaic-yamls/SD-2-base-256.yaml b/yamls/mosaic-yamls/SD-2-base-256.yaml
index 99755738..d6bbe901 100644
--- a/yamls/mosaic-yamls/SD-2-base-256.yaml
+++ b/yamls/mosaic-yamls/SD-2-base-256.yaml
@@ -1,24 +1,30 @@
 run_name: SD2-base-256
-cluster: # Insert cluster here
-gpu_num: # Insert number of GPUs
 image: mosaicml/pytorch_vision:1.13.1_cu117-python3.10-ubuntu20.04
+compute:
+  gpus: # Number of GPUs to use
+
+  ## These configurations are optional
+  # cluster: TODO # Name of the cluster to use for this run
+  # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments
+
 integrations:
 - integration_type: "git_repo"
-  git_repo: mosaicml/diffusion2
+  git_repo: mosaicml/diffusion
   git_branch: main
   pip_install: .[all]
 - integration_type: "wandb"
   project: # Insert wandb project name
   entity: # Insert wandb entity name
+env_variables:
+- key: HYDRA_FULL_ERROR
+  value: '1' # Set to '0' to limit Hydra tracebacks
+
 command: |
-  cd diffusion2
-  HYDRA_FULL_ERROR=1 composer run.py
+  cd diffusion
+  composer run.py --config-path /mnt/config --config-name parameters
 parameters:
   project: # Insert wandb project name
-  batch_size: 2048
-  seed: 17
-  scale_schedule_ratio: 1.0
   name: # Insert wandb run name
+  seed: 17
   eval_first: false
   algorithms:
     low_precision_groupnorm:
@@ -38,15 +44,14 @@ parameters:
     val_guidance_scales: []
     loss_bins: []
   dataset:
-    train_batch_size: ${batch_size}
-    eval_batch_size: 1024 # Should be 8 per device
+    train_batch_size: 2048 # Global training batch size
+    eval_batch_size: 1024 # Global evaluation batch size
     train_dataset:
       _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
       remote: # Path to object store bucket(s)
       local: # Path to corresponding local dataset(s)
-      batch_size: ${batch_size}
      tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
       caption_drop_prob: 0.1
       resize_size: 256
@@ -62,7 +67,6 @@ parameters:
      _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
       remote: # Path to object store bucket
       local: # Path to local dataset cache
-      batch_size: 8
       resize_size: 256
       prefetch_factor: 2
       num_workers: 8
@@ -73,10 +77,9 @@ parameters:
     lr: 1.0e-4
     weight_decay: 0.01
   scheduler:
-    _target_: composer.optim.MultiStepWithWarmupScheduler
+    _target_: composer.optim.LinearWithWarmupScheduler
     t_warmup: 10000ba
-    milestones:
-    - 200ep
+    alpha_f: 1.0
   logger:
     wandb:
       _target_: composer.loggers.wandb_logger.WandBLogger
@@ -103,7 +106,6 @@ parameters:
     device_train_microbatch_size: 16
     run_name: ${name}
     seed: ${seed}
-    scale_schedule_ratio: ${scale_schedule_ratio}
     save_folder: # Insert path to save folder or bucket
     save_interval: 10000ba
     save_overwrite: true
diff --git a/yamls/mosaic-yamls/SD-2-base-512.yaml b/yamls/mosaic-yamls/SD-2-base-512.yaml
index df82250d..ac26c68a 100644
--- a/yamls/mosaic-yamls/SD-2-base-512.yaml
+++ b/yamls/mosaic-yamls/SD-2-base-512.yaml
@@ -1,7 +1,7 @@
 name: SD2-base-512
 image: mosaicml/pytorch_vision:1.13.1_cu117-python3.10-ubuntu20.04
 compute:
-  gpus: 8 # Number of GPUs to use
+  gpus: # Number of GPUs to use
 
   ## These configurations are optional
   # cluster: TODO # Name of the cluster to use for this run
@@ -9,21 +9,22 @@ compute:
 
 integrations:
 - integration_type: "git_repo"
-  git_repo: mosaicml/diffusion2
+  git_repo: mosaicml/diffusion
   git_branch: main
   pip_install: .[all]
 - integration_type: "wandb"
   project: # Insert wandb project name
   entity: # Insert wandb entity name
+env_variables:
+- key: HYDRA_FULL_ERROR
+  value: '1' # Set to '0' to limit Hydra tracebacks
+
 command: |
-  cd diffusion2
-  HYDRA_FULL_ERROR=1 composer run.py
+  cd diffusion
+  composer run.py --config-path /mnt/config --config-name parameters
 parameters:
   project: # Insert wandb project name
-  batch_size: 2048
-  seed: 17
-  scale_schedule_ratio: 1.0
   name: # Insert wandb run name
+  seed: 17
   eval_first: false
   algorithms:
     ema:
@@ -49,15 +50,14 @@ parameters:
     val_guidance_scales: []
     loss_bins: []
   dataset:
-    train_batch_size: ${batch_size}
-    eval_batch_size: 1024 # Should be 8 per device
+    train_batch_size: 2048 # Global training batch size
+    eval_batch_size: 1024 # Global evaluation batch size
     train_dataset:
       _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader
       remote: # Path to object store bucket(s)
       local: # Path to corresponding local dataset(s)
-      batch_size: ${batch_size}
       tokenizer_name_or_path: stabilityai/stable-diffusion-2-base
       caption_drop_prob: 0.1
       resize_size: 512
@@ -73,7 +73,6 @@ parameters:
       _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader
       remote: # Path to object store bucket
       local: # Path to local dataset cache
-      batch_size: 8
       resize_size: 512
       prefetch_factor: 2
       num_workers: 8
@@ -84,10 +83,9 @@ parameters:
     lr: 1.0e-4
     weight_decay: 0.01
   scheduler:
-    _target_: composer.optim.MultiStepWithWarmupScheduler
+    _target_: composer.optim.LinearWithWarmupScheduler
     t_warmup: 10000ba
-    milestones:
-    - 200ep
+    alpha_f: 1.0
   logger:
     wandb:
       _target_: composer.loggers.wandb_logger.WandBLogger
@@ -114,7 +112,6 @@ parameters:
     device_train_microbatch_size: 16
     run_name: ${name}
     seed: ${seed}
-    scale_schedule_ratio: ${scale_schedule_ratio}
     save_folder: # Insert path to save folder or bucket
     save_interval: 10000ba
     save_overwrite: true
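
Note on the scheduler swap above: Composer's LinearWithWarmupScheduler ramps the
learning rate linearly from 0 to the base LR over t_warmup, then anneals the LR
multiplier linearly from alpha_i to alpha_f for the rest of training. Assuming
Composer's documented default of alpha_i = 1.0, the values used in these yamls give a
10k-batch warmup followed by a constant learning rate (alpha_f: 1.0 means no decay),
whereas the removed MultiStepWithWarmupScheduler would have cut the learning rate by
its gamma factor at the 200ep milestone. A minimal sketch of the resulting block, with
explanatory comments added here (the comments are not part of the patch):

scheduler:
  _target_: composer.optim.LinearWithWarmupScheduler
  t_warmup: 10000ba # linear warmup from 0 to the base LR over the first 10,000 batches
  alpha_f: 1.0 # final LR multiplier; 1.0 holds the LR constant after warmup

Relatedly, the per-dataloader batch_size keys are removed in favor of the global
dataset.train_batch_size / dataset.eval_batch_size values; presumably the dataloader
builders now derive each device's batch size from the global value and the number of
devices, rather than requiring it to be specified twice.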