From 364bbe9fc3dfd30cba928080bf0eed7beacce72f Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 19:05:20 +0000 Subject: [PATCH 01/14] add sdxl unet --- diffusion/models/models.py | 20 +++++++++++++++++--- setup.py | 2 +- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 6990710f..68b6472a 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -27,6 +27,7 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', + unet_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -44,7 +45,10 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. + Defaults to 'stabilityai/stable-diffusion-2-base'. + unet_model_name (str, optional): Name of the UNet model to load. Defaults to + 'stabilityai/stable-diffusion-2-base' pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. @@ -75,11 +79,21 @@ def stable_diffusion_2( metric.requires_grad_(False) if pretrained: - unet = UNet2DConditionModel.from_pretrained(model_name, subfolder='unet') + unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') else: - config = PretrainedConfig.get_config_dict(model_name, subfolder='unet') + config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') + + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + config[0]['addition_embed_type'] = None + config[0]['cross_attention_dim'] = 1024 + unet = UNet2DConditionModel(**config[0]) + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these + unet.up_blocks._fsdp_wrap = False + unet.down_blocks._fsdp_wrap = False + if encode_latents_in_fp16: vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) diff --git a/setup.py b/setup.py index d05090ce..6a79423d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', - 'diffusers[torch]==0.16.0', + 'diffusers[torch]==0.19.0', 'transformers[torch]==4.29.2', 'wandb==0.15.4', 'xformers==0.0.16', From 7dfb6f6f9c76813d7507a40a042ae1c8dcbe434b Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 20:23:42 +0000 Subject: [PATCH 02/14] fix stochastic failures in streaming datasets --- diffusion/train.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/diffusion/train.py b/diffusion/train.py index fb1bdd60..e8bf1fc1 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -7,6 +7,7 @@ from collections.abc import Iterable from typing import Any, Dict, List, Optional, Union +import time import hydra from composer import Algorithm, Callback, ComposerModel, DataSpec, Evaluator, Trainer from composer.algorithms.low_precision_groupnorm import apply_low_precision_groupnorm @@ -38,6 +39,9 @@ def train(config: DictConfig) -> None: config.dataset.train_dataset, 
batch_size=config.dataset.train_batch_size // dist.get_world_size(), ) + + # fix stochastic failures in streaming datasets + time.sleep(10) # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -59,6 +63,9 @@ def train(config: DictConfig) -> None: else: eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, batch_size=config.dataset.eval_batch_size // dist.get_world_size()) + + # fix stochastic failures in streaming datasets + time.sleep(10) # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From 36c005b983651e8cfbccdf0475e457700ff04d54 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 21:22:24 +0000 Subject: [PATCH 03/14] add some debug logging --- diffusion/models/models.py | 1 + diffusion/train.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 68b6472a..230c4660 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -84,6 +84,7 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + print('running SDXL!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 diff --git a/diffusion/train.py b/diffusion/train.py index e8bf1fc1..18c22dbe 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -39,9 +39,10 @@ def train(config: DictConfig) -> None: config.dataset.train_dataset, batch_size=config.dataset.train_batch_size // dist.get_world_size(), ) - + # fix stochastic failures in streaming datasets time.sleep(10) + print('sleeping afer dataset creation') # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -66,6 +67,7 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) + print('sleeping afer dataset creation') # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From db70078507f7de8d52f713e6951597cc33175b75 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 21:56:02 +0000 Subject: [PATCH 04/14] unpin some reqs --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 6a79423d..47fb29d2 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml@git+https://github.com/mosaicml/composer.git@6cf3d3a1aa300834c650f89460b5ac9bbc5a1e46', + 'mosaicml', 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', 'diffusers[torch]==0.19.0', - 'transformers[torch]==4.29.2', + 'transformers[torch]', 'wandb==0.15.4', - 'xformers==0.0.16', - 'triton==2.0.0', - 'torchmetrics[image]==0.11.3', + 'xformers', + 'triton', + 'torchmetrics[image]', 'clean-fid', 'clip@git+https://github.com/openai/CLIP.git', ] From b667fb46fc421dd530082bc50cd65a923502d626 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 16:33:06 -0700 Subject: [PATCH 05/14] add yamls --- yamls/local-yamls/SDXL-a100-256-lite.yaml | 136 ++++++++++++++ yamls/local-yamls/SDXL-h100-256.yaml | 212 ++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 yamls/local-yamls/SDXL-a100-256-lite.yaml create mode 100644 yamls/local-yamls/SDXL-h100-256.yaml 
diff --git a/yamls/local-yamls/SDXL-a100-256-lite.yaml b/yamls/local-yamls/SDXL-a100-256-lite.yaml new file mode 100644 index 00000000..60d048da --- /dev/null +++ b/yamls/local-yamls/SDXL-a100-256-lite.yaml @@ -0,0 +1,136 @@ +run_name: sd2-sdxl-unet-256 +cluster: r1z1 +gpu_num: 4 +image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 +integrations: + - integration_type: "git_repo" + git_repo: jazcollins/diffusion + git_branch: sdxl + pip_install: .[all] + - integration_type: "wandb" + project: jasmine-sd2-sdxl-unet + entity: mosaic-ml +command: | + pip install -U ninja + pip install -U git+https://github.com/facebookresearch/xformers + cd diffusion + HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters + (echo "Command failed - killing python" && pkill python && exit 1) + +parameters: + project: jasmine-sd2-sdxl-unet + batch_size: 32 # 2048 + seed: 17 + scale_schedule_ratio: 1.0 + name: test # wandb run name + eval_first: true + algorithms: + low_precision_groupnorm: + attribute: unet + precision: amp_fp16 + low_precision_layernorm: + attribute: unet + precision: amp_fp16 + model: + _target_: diffusion.models.models.stable_diffusion_2 + pretrained: false + model_name: stabilityai/stable-diffusion-2-base + unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 + precomputed_latents: true + encode_latents_in_fp16: true + fsdp: true + val_metrics: + - _target_: torchmetrics.MeanSquaredError + val_guidance_scales: [] + loss_bins: [] + dataset: + train_batch_size: ${batch_size} + eval_batch_size: 1024 # Should be 8 per device + train_dataset: + _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader + remote: + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 + local: + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 + batch_size: ${batch_size} + tokenizer_name_or_path: stabilityai/stable-diffusion-2-base + caption_drop_prob: 0.1 + resize_size: 256 + drop_last: true + shuffle: true + prefetch_factor: 2 + num_workers: 8 + persistent_workers: true + pin_memory: true + download_timeout: 900 + num_canonical_nodes: 32 + eval_dataset: + _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader + remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ + local: /tmp/mds-cache/mds-coco-10k-1/ + batch_size: 8 + resize_size: 256 + prefetch_factor: 2 + num_workers: 8 + persistent_workers: True + pin_memory: True + optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + weight_decay: 0.01 + scheduler: + _target_: composer.optim.MultiStepWithWarmupScheduler + t_warmup: 10000ba + milestones: + - 2000ep + logger: + wandb: + _target_: composer.loggers.wandb_logger.WandBLogger + name: ${name} + project: ${project} + group: ${name} + callbacks: + speed_monitor: + _target_: composer.callbacks.speed_monitor.SpeedMonitor + window_size: 10 + lr_monitor: + _target_: composer.callbacks.lr_monitor.LRMonitor + memory_monitor: + _target_: composer.callbacks.memory_monitor.MemoryMonitor + runtime_estimator: + _target_: composer.callbacks.runtime_estimator.RuntimeEstimator + optimizer_monitor: + _target_: composer.callbacks.OptimizerMonitor + image_logger: + _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages + prompts: + - a couple waiting to cross the street underneath an umbrella. + - three men walking in the rain with umbrellas. 
+ - a man is riding a red motor cycle, with baskets. + - a clock that has animal pictures instead of numbers. + - a brightly decorated bus sits on the road. + - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. + - a white and blue bus is on a city street at night. + - a large clock tower on a building by a river + - beans and other food is sitting on a plate. + - a group of people that are standing up on a tennis court + size: 256 + guidance_scale: 5.0 + trainer: + _target_: composer.Trainer + device: gpu + max_duration: 550000ba + eval_interval: 1000ba + device_train_microbatch_size: 8 # 64 + run_name: ${name} + seed: ${seed} + scale_schedule_ratio: ${scale_schedule_ratio} + save_folder: oci://mosaicml-internal-checkpoints/jasmine/test/ + save_interval: 10000ba + save_overwrite: false + autoresume: true + fsdp_config: + sharding_strategy: "SHARD_GRAD_OP" + progress_bar: false diff --git a/yamls/local-yamls/SDXL-h100-256.yaml b/yamls/local-yamls/SDXL-h100-256.yaml new file mode 100644 index 00000000..86e751fc --- /dev/null +++ b/yamls/local-yamls/SDXL-h100-256.yaml @@ -0,0 +1,212 @@ +run_name: sd2-sdxl-unet-256 +cluster: r9z1 +gpu_num: 32 +env_variables: + - key: NCCL_IB_PCI_RELAXED_ORDERING + value: "0" +image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 +compute: + instance: coreweave.h100-80 +scheduling: + resumable: true + priority: medium +integrations: + - integration_type: "git_repo" + git_repo: jazcollins/diffusion + git_branch: sdxl + pip_install: .[all] + - integration_type: "wandb" + project: jasmine-sd2-sdxl-unet + entity: mosaic-ml +command: | + pip install -U ninja + pip install -U git+https://github.com/facebookresearch/xformers + cd diffusion + HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters + (echo "Command failed - killing python" && pkill python && exit 1) + +parameters: + project: jasmine-sd2-sdxl-unet + batch_size: 2048 + seed: 17 + scale_schedule_ratio: 1.0 + name: 90m-sdxl-unet # wandb run name + eval_first: true + algorithms: + low_precision_groupnorm: + attribute: unet + precision: amp_fp16 + low_precision_layernorm: + attribute: unet + precision: amp_fp16 + model: + _target_: diffusion.models.models.stable_diffusion_2 + pretrained: false + model_name: stabilityai/stable-diffusion-2-base + unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 + precomputed_latents: true + encode_latents_in_fp16: true + fsdp: true + val_metrics: + - _target_: torchmetrics.MeanSquaredError + val_guidance_scales: [] + loss_bins: [] + dataset: + train_batch_size: ${batch_size} + eval_batch_size: 1024 # Should be 8 per device + train_dataset: + _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader + remote: + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/512-768 + - 
oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 + local: + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 + - 
/tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 + batch_size: ${batch_size} + tokenizer_name_or_path: stabilityai/stable-diffusion-2-base + caption_drop_prob: 0.1 + resize_size: 256 + drop_last: true + shuffle: true + prefetch_factor: 2 + num_workers: 8 + persistent_workers: true + pin_memory: true + download_timeout: 900 + num_canonical_nodes: 32 + eval_dataset: + _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader + remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ + local: /tmp/mds-cache/mds-coco-10k-1/ + batch_size: 8 + resize_size: 256 + prefetch_factor: 2 + num_workers: 8 + persistent_workers: True + pin_memory: True + optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + weight_decay: 0.01 + scheduler: + _target_: composer.optim.MultiStepWithWarmupScheduler + t_warmup: 10000ba + milestones: + - 2000ep + logger: + wandb: + _target_: composer.loggers.wandb_logger.WandBLogger + name: ${name} + project: ${project} + group: ${name} + callbacks: + speed_monitor: + _target_: composer.callbacks.speed_monitor.SpeedMonitor + window_size: 10 + lr_monitor: + _target_: composer.callbacks.lr_monitor.LRMonitor + memory_monitor: + _target_: composer.callbacks.memory_monitor.MemoryMonitor + runtime_estimator: + _target_: composer.callbacks.runtime_estimator.RuntimeEstimator + optimizer_monitor: + _target_: composer.callbacks.OptimizerMonitor + image_logger: + _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages + prompts: + - a couple waiting to cross the street underneath an umbrella. + - three men walking in the rain with umbrellas. + - a man is riding a red motor cycle, with baskets. + - a clock that has animal pictures instead of numbers. + - a brightly decorated bus sits on the road. + - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. + - a white and blue bus is on a city street at night. + - a large clock tower on a building by a river + - beans and other food is sitting on a plate. 
+ - a group of people that are standing up on a tennis court + size: 256 + guidance_scale: 5.0 + trainer: + _target_: composer.Trainer + device: gpu + max_duration: 550000ba + eval_interval: 1000ba + device_train_microbatch_size: 64 + run_name: ${name} + seed: ${seed} + scale_schedule_ratio: ${scale_schedule_ratio} + save_folder: oci://mosaicml-internal-checkpoints/jasmine/stable-diffusion-sdxl-unet-256-90m-h100/ + save_interval: 10000ba + save_overwrite: false + autoresume: true + fsdp_config: + sharding_strategy: "SHARD_GRAD_OP" + progress_bar: false From abc7b015172fad35aa1578b3fbe6c285b3ceedd0 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Tue, 8 Aug 2023 00:09:15 +0000 Subject: [PATCH 06/14] remove debug prints --- diffusion/models/models.py | 1 - diffusion/train.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 230c4660..68b6472a 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -84,7 +84,6 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL - print('running SDXL!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 diff --git a/diffusion/train.py b/diffusion/train.py index 18c22dbe..2d4380ba 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -42,7 +42,6 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) - print('sleeping afer dataset creation') # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -67,7 +66,6 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) - print('sleeping afer dataset creation') # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From 631b9f4dfe1c05c204229c6f7c82c6c726ca090b Mon Sep 17 00:00:00 2001 From: jazcollins Date: Tue, 8 Aug 2023 21:59:22 +0000 Subject: [PATCH 07/14] allow passing vae model path --- diffusion/models/models.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 68b6472a..20d23f2d 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -28,6 +28,7 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', unet_model_name: str = 'stabilityai/stable-diffusion-2-base', + vae_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -48,7 +49,9 @@ def stable_diffusion_2( model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to - 'stabilityai/stable-diffusion-2-base' + 'stabilityai/stable-diffusion-2-base'. + vae_model_name (str, optional): Name of the VAE model to load. Defaults to + 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. 
@@ -84,6 +87,7 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 @@ -95,10 +99,13 @@ def stable_diffusion_2( unet.down_blocks._fsdp_wrap = False if encode_latents_in_fp16: - vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) + try: + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) + except: # for handling SDXL vae fp16 fixed checkpoint + vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: - vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae') + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae') text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') From e785ca6880881d247b5ea271d87e18c7430c4f6a Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 10 Aug 2023 00:02:48 +0000 Subject: [PATCH 08/14] add base --- diffusion/models/models.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 20d23f2d..356f1455 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -86,17 +86,22 @@ def stable_diffusion_2( else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these unet.up_blocks._fsdp_wrap = False unet.down_blocks._fsdp_wrap = False + # for block in unet.up_blocks: + # block._fsdp_wrap = False + # for block in unet.down_blocks: + # block._fsdp_wrap = False + if encode_latents_in_fp16: try: From 218981c82d30be98d67908f8235cb10ba696c622 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:21:37 +0000 Subject: [PATCH 09/14] remove trailing whitespace --- diffusion/models/models.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 356f1455..0c477b1d 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -48,9 +48,9 @@ def stable_diffusion_2( Args: model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. - unet_model_name (str, optional): Name of the UNet model to load. Defaults to + unet_model_name (str, optional): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. - vae_model_name (str, optional): Name of the VAE model to load. 
Defaults to + vae_model_name (str, optional): Name of the VAE model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', @@ -86,14 +86,14 @@ def stable_diffusion_2( else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these unet.up_blocks._fsdp_wrap = False unet.down_blocks._fsdp_wrap = False @@ -102,11 +102,10 @@ def stable_diffusion_2( # for block in unet.down_blocks: # block._fsdp_wrap = False - if encode_latents_in_fp16: - try: + try: vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) - except: # for handling SDXL vae fp16 fixed checkpoint + except: # for handling SDXL vae fp16 fixed checkpoint vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: From 049a1fb695181a091d93e1526acbd886a5ce485a Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:39:21 +0000 Subject: [PATCH 10/14] split sdxl into separate model --- diffusion/models/models.py | 138 ++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 23 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 0c477b1d..b662b78c 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -27,8 +27,6 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', - unet_model_name: str = 'stabilityai/stable-diffusion-2-base', - vae_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -45,13 +43,114 @@ def stable_diffusion_2( Requires batches of matched images and text prompts to train. Generates images from text prompts. + Args: + model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. + Defaults to 'stabilityai/stable-diffusion-2-base'. + pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + prediction_type (str): The type of prediction to use. Must be one of 'sample', + 'epsilon', or 'v_prediction'. Default: `epsilon`. + train_metrics (list, optional): List of metrics to compute during training. If None, defaults to + [MeanSquaredError()]. + val_metrics (list, optional): List of metrics to compute during validation. If None, defaults to + [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. + val_guidance_scales (list, optional): List of scales to use for validation guidance. 
If None, defaults to + [1.0, 3.0, 7.0]. + val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to + [(0, 1)]. + precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. + fsdp (bool, optional): Whether to use FSDP. Defaults to True. + """ + if train_metrics is None: + train_metrics = [MeanSquaredError()] + if val_metrics is None: + val_metrics = [MeanSquaredError(), FrechetInceptionDistance(normalize=True)] + if val_guidance_scales is None: + val_guidance_scales = [1.0, 3.0, 7.0] + if loss_bins is None: + loss_bins = [(0, 1)] + # Fix a bug where CLIPScore requires grad + for metric in val_metrics: + if isinstance(metric, CLIPScore): + metric.requires_grad_(False) + + if pretrained: + unet = UNet2DConditionModel.from_pretrained(model_name, subfolder='unet') + else: + config = PretrainedConfig.get_config_dict(model_name, subfolder='unet') + unet = UNet2DConditionModel(**config[0]) + + if encode_latents_in_fp16: + vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) + text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) + else: + vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae') + text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') + + tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') + noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder='scheduler') + inference_noise_scheduler = DDIMScheduler(num_train_timesteps=noise_scheduler.config.num_train_timesteps, + beta_start=noise_scheduler.config.beta_start, + beta_end=noise_scheduler.config.beta_end, + beta_schedule=noise_scheduler.config.beta_schedule, + trained_betas=noise_scheduler.config.trained_betas, + clip_sample=noise_scheduler.config.clip_sample, + set_alpha_to_one=noise_scheduler.config.set_alpha_to_one, + prediction_type=prediction_type) + + model = StableDiffusion( + unet=unet, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + noise_scheduler=noise_scheduler, + inference_noise_scheduler=inference_noise_scheduler, + prediction_type=prediction_type, + train_metrics=train_metrics, + val_metrics=val_metrics, + val_guidance_scales=val_guidance_scales, + val_seed=val_seed, + loss_bins=loss_bins, + precomputed_latents=precomputed_latents, + encode_latents_in_fp16=encode_latents_in_fp16, + fsdp=fsdp, + ) + if torch.cuda.is_available(): + model = DeviceGPU().module_to_device(model) + if is_xformers_installed: + model.unet.enable_xformers_memory_efficient_attention() + model.vae.enable_xformers_memory_efficient_attention() + return model + + +def stable_diffusion_xl( + model_name: str = 'stabilityai/stable-diffusion-2-base', + unet_model_name: str = 'stabilityai/stable-diffusion-xl-base-1.0', + vae_model_name: str = 'madebyollin/sdxl-vae-fp16-fix', + pretrained: bool = True, + prediction_type: str = 'epsilon', + train_metrics: Optional[List] = None, + val_metrics: Optional[List] = None, + val_guidance_scales: Optional[List] = None, + val_seed: int = 1138, + loss_bins: Optional[List] = None, + precomputed_latents: bool = False, + encode_latents_in_fp16: bool = True, + fsdp: bool = True, +): + """Stable diffusion 2 training setup + SDXL UNet and VAE. 
+ + Requires batches of matched images and text prompts to train. Generates images from text + prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. + Args: model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to - 'stabilityai/stable-diffusion-2-base'. + 'stabilityai/stable-diffusion-xl-base-1.0'. vae_model_name (str, optional): Name of the VAE model to load. Defaults to - 'stabilityai/stable-diffusion-2-base'. + 'madebyollin/sdxl-vae-fp16-fix'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. @@ -85,31 +184,24 @@ def stable_diffusion_2( unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL - print('using SDXL unet!') - config[0]['addition_embed_type'] = None - config[0]['cross_attention_dim'] = 1024 - + # Currently not doing micro-conditioning, so set config appropriately + config[0]['addition_embed_type'] = None + config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL - # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these - unet.up_blocks._fsdp_wrap = False - unet.down_blocks._fsdp_wrap = False - # for block in unet.up_blocks: - # block._fsdp_wrap = False - # for block in unet.down_blocks: - # block._fsdp_wrap = False + # Prevent fsdp from wrapping up_blocks and down_blocks because the forward pass calls length on these + unet.up_blocks._fsdp_wrap = False + unet.down_blocks._fsdp_wrap = False + for block in unet.up_blocks: + block._fsdp_wrap = True + for block in unet.down_blocks: + block._fsdp_wrap = True if encode_latents_in_fp16: - try: - vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) - except: # for handling SDXL vae fp16 fixed checkpoint - vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) + vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: - vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae') + vae = AutoencoderKL.from_pretrained(vae_model_name) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') From df4db968aea509a6b7300d39c6285a34b49421c9 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:41:22 +0000 Subject: [PATCH 11/14] remove local yamls --- yamls/local-yamls/SDXL-a100-256-lite.yaml | 136 -------------- yamls/local-yamls/SDXL-h100-256.yaml | 212 ---------------------- 2 files changed, 348 deletions(-) delete mode 100644 yamls/local-yamls/SDXL-a100-256-lite.yaml delete mode 100644 yamls/local-yamls/SDXL-h100-256.yaml diff --git a/yamls/local-yamls/SDXL-a100-256-lite.yaml 
b/yamls/local-yamls/SDXL-a100-256-lite.yaml deleted file mode 100644 index 60d048da..00000000 --- a/yamls/local-yamls/SDXL-a100-256-lite.yaml +++ /dev/null @@ -1,136 +0,0 @@ -run_name: sd2-sdxl-unet-256 -cluster: r1z1 -gpu_num: 4 -image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 -integrations: - - integration_type: "git_repo" - git_repo: jazcollins/diffusion - git_branch: sdxl - pip_install: .[all] - - integration_type: "wandb" - project: jasmine-sd2-sdxl-unet - entity: mosaic-ml -command: | - pip install -U ninja - pip install -U git+https://github.com/facebookresearch/xformers - cd diffusion - HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters - (echo "Command failed - killing python" && pkill python && exit 1) - -parameters: - project: jasmine-sd2-sdxl-unet - batch_size: 32 # 2048 - seed: 17 - scale_schedule_ratio: 1.0 - name: test # wandb run name - eval_first: true - algorithms: - low_precision_groupnorm: - attribute: unet - precision: amp_fp16 - low_precision_layernorm: - attribute: unet - precision: amp_fp16 - model: - _target_: diffusion.models.models.stable_diffusion_2 - pretrained: false - model_name: stabilityai/stable-diffusion-2-base - unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 - precomputed_latents: true - encode_latents_in_fp16: true - fsdp: true - val_metrics: - - _target_: torchmetrics.MeanSquaredError - val_guidance_scales: [] - loss_bins: [] - dataset: - train_batch_size: ${batch_size} - eval_batch_size: 1024 # Should be 8 per device - train_dataset: - _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader - remote: - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 - local: - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 - batch_size: ${batch_size} - tokenizer_name_or_path: stabilityai/stable-diffusion-2-base - caption_drop_prob: 0.1 - resize_size: 256 - drop_last: true - shuffle: true - prefetch_factor: 2 - num_workers: 8 - persistent_workers: true - pin_memory: true - download_timeout: 900 - num_canonical_nodes: 32 - eval_dataset: - _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader - remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ - local: /tmp/mds-cache/mds-coco-10k-1/ - batch_size: 8 - resize_size: 256 - prefetch_factor: 2 - num_workers: 8 - persistent_workers: True - pin_memory: True - optimizer: - _target_: torch.optim.AdamW - lr: 1.0e-4 - weight_decay: 0.01 - scheduler: - _target_: composer.optim.MultiStepWithWarmupScheduler - t_warmup: 10000ba - milestones: - - 2000ep - logger: - wandb: - _target_: composer.loggers.wandb_logger.WandBLogger - name: ${name} - project: ${project} - group: ${name} - callbacks: - speed_monitor: - _target_: composer.callbacks.speed_monitor.SpeedMonitor - window_size: 10 - lr_monitor: - _target_: composer.callbacks.lr_monitor.LRMonitor - memory_monitor: - _target_: composer.callbacks.memory_monitor.MemoryMonitor - runtime_estimator: - _target_: composer.callbacks.runtime_estimator.RuntimeEstimator - optimizer_monitor: - _target_: composer.callbacks.OptimizerMonitor - image_logger: - _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages - prompts: - - a couple waiting to cross the street underneath an umbrella. - - three men walking in the rain with umbrellas. - - a man is riding a red motor cycle, with baskets. 
- - a clock that has animal pictures instead of numbers. - - a brightly decorated bus sits on the road. - - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. - - a white and blue bus is on a city street at night. - - a large clock tower on a building by a river - - beans and other food is sitting on a plate. - - a group of people that are standing up on a tennis court - size: 256 - guidance_scale: 5.0 - trainer: - _target_: composer.Trainer - device: gpu - max_duration: 550000ba - eval_interval: 1000ba - device_train_microbatch_size: 8 # 64 - run_name: ${name} - seed: ${seed} - scale_schedule_ratio: ${scale_schedule_ratio} - save_folder: oci://mosaicml-internal-checkpoints/jasmine/test/ - save_interval: 10000ba - save_overwrite: false - autoresume: true - fsdp_config: - sharding_strategy: "SHARD_GRAD_OP" - progress_bar: false diff --git a/yamls/local-yamls/SDXL-h100-256.yaml b/yamls/local-yamls/SDXL-h100-256.yaml deleted file mode 100644 index 86e751fc..00000000 --- a/yamls/local-yamls/SDXL-h100-256.yaml +++ /dev/null @@ -1,212 +0,0 @@ -run_name: sd2-sdxl-unet-256 -cluster: r9z1 -gpu_num: 32 -env_variables: - - key: NCCL_IB_PCI_RELAXED_ORDERING - value: "0" -image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 -compute: - instance: coreweave.h100-80 -scheduling: - resumable: true - priority: medium -integrations: - - integration_type: "git_repo" - git_repo: jazcollins/diffusion - git_branch: sdxl - pip_install: .[all] - - integration_type: "wandb" - project: jasmine-sd2-sdxl-unet - entity: mosaic-ml -command: | - pip install -U ninja - pip install -U git+https://github.com/facebookresearch/xformers - cd diffusion - HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters - (echo "Command failed - killing python" && pkill python && exit 1) - -parameters: - project: jasmine-sd2-sdxl-unet - batch_size: 2048 - seed: 17 - scale_schedule_ratio: 1.0 - name: 90m-sdxl-unet # wandb run name - eval_first: true - algorithms: - low_precision_groupnorm: - attribute: unet - precision: amp_fp16 - low_precision_layernorm: - attribute: unet - precision: amp_fp16 - model: - _target_: diffusion.models.models.stable_diffusion_2 - pretrained: false - model_name: stabilityai/stable-diffusion-2-base - unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 - precomputed_latents: true - encode_latents_in_fp16: true - fsdp: true - val_metrics: - - _target_: torchmetrics.MeanSquaredError - val_guidance_scales: [] - loss_bins: [] - dataset: - train_batch_size: ${batch_size} - eval_batch_size: 1024 # Should be 8 per device - train_dataset: - _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader - remote: - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/512-768 - - 
oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 - local: - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 - - 
/tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 - batch_size: ${batch_size} - tokenizer_name_or_path: stabilityai/stable-diffusion-2-base - caption_drop_prob: 0.1 - resize_size: 256 - drop_last: true - shuffle: true - prefetch_factor: 2 - num_workers: 8 - persistent_workers: true - pin_memory: true - download_timeout: 900 - num_canonical_nodes: 32 - eval_dataset: - _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader - remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ - local: /tmp/mds-cache/mds-coco-10k-1/ - batch_size: 8 - resize_size: 256 - prefetch_factor: 2 - num_workers: 8 - persistent_workers: True - pin_memory: True - optimizer: - _target_: torch.optim.AdamW - lr: 1.0e-4 - weight_decay: 0.01 - scheduler: - _target_: composer.optim.MultiStepWithWarmupScheduler - t_warmup: 10000ba - milestones: - - 2000ep - logger: - wandb: - _target_: composer.loggers.wandb_logger.WandBLogger - name: ${name} - project: ${project} - group: ${name} - callbacks: - speed_monitor: - _target_: composer.callbacks.speed_monitor.SpeedMonitor - window_size: 10 - lr_monitor: - _target_: composer.callbacks.lr_monitor.LRMonitor - memory_monitor: - _target_: composer.callbacks.memory_monitor.MemoryMonitor - runtime_estimator: - _target_: composer.callbacks.runtime_estimator.RuntimeEstimator - optimizer_monitor: - _target_: composer.callbacks.OptimizerMonitor - image_logger: - _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages - prompts: - - a couple waiting to cross the street underneath an umbrella. - - three men walking in the rain with umbrellas. - - a man is riding a red motor cycle, with baskets. - - a clock that has animal pictures instead of numbers. - - a brightly decorated bus sits on the road. - - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. - - a white and blue bus is on a city street at night. - - a large clock tower on a building by a river - - beans and other food is sitting on a plate. 
- - a group of people that are standing up on a tennis court - size: 256 - guidance_scale: 5.0 - trainer: - _target_: composer.Trainer - device: gpu - max_duration: 550000ba - eval_interval: 1000ba - device_train_microbatch_size: 64 - run_name: ${name} - seed: ${seed} - scale_schedule_ratio: ${scale_schedule_ratio} - save_folder: oci://mosaicml-internal-checkpoints/jasmine/stable-diffusion-sdxl-unet-256-90m-h100/ - save_interval: 10000ba - save_overwrite: false - autoresume: true - fsdp_config: - sharding_strategy: "SHARD_GRAD_OP" - progress_bar: false From f31cd8fa8e227bfde806b1cbed528cb9f6edc645 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:44:16 +0000 Subject: [PATCH 12/14] clean up sd2 doc --- diffusion/models/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index b662b78c..b52e6fd0 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -44,8 +44,7 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. - Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. From b07abcf5aecf4c88790c8f71ce2d5fffd8cfbf64 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:57:09 +0000 Subject: [PATCH 13/14] one more doc fix --- diffusion/models/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index b52e6fd0..5ab83398 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -144,8 +144,8 @@ def stable_diffusion_xl( prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. - Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Determines the text encoder, tokenizer, + and noise scheduler. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-xl-base-1.0'. vae_model_name (str, optional): Name of the VAE model to load. Defaults to From 9739743e955020e6c39298ef46d346d4758b76de Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 21 Aug 2023 16:58:57 +0000 Subject: [PATCH 14/14] add NotImplementedError, fix docs --- diffusion/models/models.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 5ab83398..e2eed1a8 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -44,8 +44,8 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. - pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + model_name (str): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. + pretrained (bool): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. 
Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. train_metrics (list, optional): List of metrics to compute during training. If None, defaults to @@ -54,12 +54,12 @@ def stable_diffusion_2( [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. val_guidance_scales (list, optional): List of scales to use for validation guidance. If None, defaults to [1.0, 3.0, 7.0]. - val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + val_seed (int): Seed to use for generating evaluation images. Defaults to 1138. loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to [(0, 1)]. - precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. - encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. - fsdp (bool, optional): Whether to use FSDP. Defaults to True. + precomputed_latents (bool): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool): Whether to encode latents in fp16. Defaults to True. + fsdp (bool): Whether to use FSDP. Defaults to True. """ if train_metrics is None: train_metrics = [MeanSquaredError()] @@ -144,13 +144,14 @@ def stable_diffusion_xl( prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder, tokenizer, + model_name (str): Name of the model to load. Determines the text encoder, tokenizer, and noise scheduler. Defaults to 'stabilityai/stable-diffusion-2-base'. - unet_model_name (str, optional): Name of the UNet model to load. Defaults to + unet_model_name (str): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-xl-base-1.0'. - vae_model_name (str, optional): Name of the VAE model to load. Defaults to - 'madebyollin/sdxl-vae-fp16-fix'. - pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + vae_model_name (str): Name of the VAE model to load. Defaults to + 'madebyollin/sdxl-vae-fp16-fix' as the official VAE checkpoint (from + 'stabilityai/stable-diffusion-xl-base-1.0') is not compatible with fp16. + pretrained (bool): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. train_metrics (list, optional): List of metrics to compute during training. If None, defaults to @@ -159,12 +160,12 @@ def stable_diffusion_xl( [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. val_guidance_scales (list, optional): List of scales to use for validation guidance. If None, defaults to [1.0, 3.0, 7.0]. - val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + val_seed (int): Seed to use for generating evaluation images. Defaults to 1138. loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to [(0, 1)]. - precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. - encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. - fsdp (bool, optional): Whether to use FSDP. Defaults to True. + precomputed_latents (bool): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool): Whether to encode latents in fp16. Defaults to True. + fsdp (bool): Whether to use FSDP. 
Defaults to True. """ if train_metrics is None: train_metrics = [MeanSquaredError()] @@ -180,7 +181,7 @@ def stable_diffusion_xl( metric.requires_grad_(False) if pretrained: - unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') + raise NotImplementedError('Full SDXL pipeline not implemented yet.') else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') # Currently not doing micro-conditioning, so set config appropriately
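
The later patches in this series converge on a specific construction for the new stable_diffusion_xl factory: build the SDXL UNet from its config rather than pretrained weights, disable micro-conditioning, resize cross-attention for the SD2 text encoder, set Composer FSDP wrapping hints on the UNet blocks, and load the fp16-safe VAE checkpoint from the repo root. The snippet below is a minimal standalone sketch of that construction for reference; it is not part of any patch above, and it assumes diffusers==0.19.0 and transformers are installed with access to the Hugging Face Hub. The _fsdp_wrap attributes are Composer-specific hints and have no effect outside Composer's FSDP wrapping.

import torch
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import PretrainedConfig

unet_model_name = 'stabilityai/stable-diffusion-xl-base-1.0'
vae_model_name = 'madebyollin/sdxl-vae-fp16-fix'

# Build the SDXL UNet from config only (the pretrained=False path in the patches):
# micro-conditioning is disabled and cross-attention is sized for the SD2 text encoder.
config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet')
config[0]['addition_embed_type'] = None
config[0]['cross_attention_dim'] = 1024
unet = UNet2DConditionModel(**config[0])

# Composer FSDP hints: the up_blocks/down_blocks containers have len() called on them
# in the forward pass, so leave the containers unwrapped and wrap each child block.
unet.up_blocks._fsdp_wrap = False
unet.down_blocks._fsdp_wrap = False
for block in unet.up_blocks:
    block._fsdp_wrap = True
for block in unet.down_blocks:
    block._fsdp_wrap = True

# The fp16-fixed SDXL VAE checkpoint lives at the repo root, so no subfolder='vae' here.
vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16)

print(f'UNet params: {sum(p.numel() for p in unet.parameters()) / 1e6:.1f}M')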