From 364bbe9fc3dfd30cba928080bf0eed7beacce72f Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 19:05:20 +0000 Subject: [PATCH 01/14] add sdxl unet --- diffusion/models/models.py | 20 +++++++++++++++++--- setup.py | 2 +- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 6990710f..68b6472a 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -27,6 +27,7 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', + unet_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -44,7 +45,10 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. + Defaults to 'stabilityai/stable-diffusion-2-base'. + unet_model_name (str, optional): Name of the UNet model to load. Defaults to + 'stabilityai/stable-diffusion-2-base' pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. @@ -75,11 +79,21 @@ def stable_diffusion_2( metric.requires_grad_(False) if pretrained: - unet = UNet2DConditionModel.from_pretrained(model_name, subfolder='unet') + unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') else: - config = PretrainedConfig.get_config_dict(model_name, subfolder='unet') + config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') + + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + config[0]['addition_embed_type'] = None + config[0]['cross_attention_dim'] = 1024 + unet = UNet2DConditionModel(**config[0]) + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these + unet.up_blocks._fsdp_wrap = False + unet.down_blocks._fsdp_wrap = False + if encode_latents_in_fp16: vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) diff --git a/setup.py b/setup.py index d05090ce..6a79423d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', - 'diffusers[torch]==0.16.0', + 'diffusers[torch]==0.19.0', 'transformers[torch]==4.29.2', 'wandb==0.15.4', 'xformers==0.0.16', From 7dfb6f6f9c76813d7507a40a042ae1c8dcbe434b Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 20:23:42 +0000 Subject: [PATCH 02/14] fix stochastic failures in streaming datasets --- diffusion/train.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/diffusion/train.py b/diffusion/train.py index fb1bdd60..e8bf1fc1 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -7,6 +7,7 @@ from collections.abc import Iterable from typing import Any, Dict, List, Optional, Union +import time import hydra from composer import Algorithm, Callback, ComposerModel, DataSpec, Evaluator, Trainer from composer.algorithms.low_precision_groupnorm import apply_low_precision_groupnorm @@ -38,6 +39,9 @@ def train(config: DictConfig) -> None: config.dataset.train_dataset, 
batch_size=config.dataset.train_batch_size // dist.get_world_size(), ) + + # fix stochastic failures in streaming datasets + time.sleep(10) # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -59,6 +63,9 @@ def train(config: DictConfig) -> None: else: eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, batch_size=config.dataset.eval_batch_size // dist.get_world_size()) + + # fix stochastic failures in streaming datasets + time.sleep(10) # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From 36c005b983651e8cfbccdf0475e457700ff04d54 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 21:22:24 +0000 Subject: [PATCH 03/14] add some debug logging --- diffusion/models/models.py | 1 + diffusion/train.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 68b6472a..230c4660 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -84,6 +84,7 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + print('running SDXL!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 diff --git a/diffusion/train.py b/diffusion/train.py index e8bf1fc1..18c22dbe 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -39,9 +39,10 @@ def train(config: DictConfig) -> None: config.dataset.train_dataset, batch_size=config.dataset.train_batch_size // dist.get_world_size(), ) - + # fix stochastic failures in streaming datasets time.sleep(10) + print('sleeping afer dataset creation') # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -66,6 +67,7 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) + print('sleeping afer dataset creation') # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From db70078507f7de8d52f713e6951597cc33175b75 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 21:56:02 +0000 Subject: [PATCH 04/14] unpin some reqs --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 6a79423d..47fb29d2 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml@git+https://github.com/mosaicml/composer.git@6cf3d3a1aa300834c650f89460b5ac9bbc5a1e46', + 'mosaicml', 'mosaicml-streaming>=0.4.0,<1.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', 'diffusers[torch]==0.19.0', - 'transformers[torch]==4.29.2', + 'transformers[torch]', 'wandb==0.15.4', - 'xformers==0.0.16', - 'triton==2.0.0', - 'torchmetrics[image]==0.11.3', + 'xformers', + 'triton', + 'torchmetrics[image]', 'clean-fid', 'clip@git+https://github.com/openai/CLIP.git', ] From b667fb46fc421dd530082bc50cd65a923502d626 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 7 Aug 2023 16:33:06 -0700 Subject: [PATCH 05/14] add yamls --- yamls/local-yamls/SDXL-a100-256-lite.yaml | 136 ++++++++++++++ yamls/local-yamls/SDXL-h100-256.yaml | 212 ++++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 yamls/local-yamls/SDXL-a100-256-lite.yaml create mode 100644 yamls/local-yamls/SDXL-h100-256.yaml 
diff --git a/yamls/local-yamls/SDXL-a100-256-lite.yaml b/yamls/local-yamls/SDXL-a100-256-lite.yaml new file mode 100644 index 00000000..60d048da --- /dev/null +++ b/yamls/local-yamls/SDXL-a100-256-lite.yaml @@ -0,0 +1,136 @@ +run_name: sd2-sdxl-unet-256 +cluster: r1z1 +gpu_num: 4 +image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 +integrations: + - integration_type: "git_repo" + git_repo: jazcollins/diffusion + git_branch: sdxl + pip_install: .[all] + - integration_type: "wandb" + project: jasmine-sd2-sdxl-unet + entity: mosaic-ml +command: | + pip install -U ninja + pip install -U git+https://github.com/facebookresearch/xformers + cd diffusion + HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters + (echo "Command failed - killing python" && pkill python && exit 1) + +parameters: + project: jasmine-sd2-sdxl-unet + batch_size: 32 # 2048 + seed: 17 + scale_schedule_ratio: 1.0 + name: test # wandb run name + eval_first: true + algorithms: + low_precision_groupnorm: + attribute: unet + precision: amp_fp16 + low_precision_layernorm: + attribute: unet + precision: amp_fp16 + model: + _target_: diffusion.models.models.stable_diffusion_2 + pretrained: false + model_name: stabilityai/stable-diffusion-2-base + unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 + precomputed_latents: true + encode_latents_in_fp16: true + fsdp: true + val_metrics: + - _target_: torchmetrics.MeanSquaredError + val_guidance_scales: [] + loss_bins: [] + dataset: + train_batch_size: ${batch_size} + eval_batch_size: 1024 # Should be 8 per device + train_dataset: + _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader + remote: + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 + local: + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 + batch_size: ${batch_size} + tokenizer_name_or_path: stabilityai/stable-diffusion-2-base + caption_drop_prob: 0.1 + resize_size: 256 + drop_last: true + shuffle: true + prefetch_factor: 2 + num_workers: 8 + persistent_workers: true + pin_memory: true + download_timeout: 900 + num_canonical_nodes: 32 + eval_dataset: + _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader + remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ + local: /tmp/mds-cache/mds-coco-10k-1/ + batch_size: 8 + resize_size: 256 + prefetch_factor: 2 + num_workers: 8 + persistent_workers: True + pin_memory: True + optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + weight_decay: 0.01 + scheduler: + _target_: composer.optim.MultiStepWithWarmupScheduler + t_warmup: 10000ba + milestones: + - 2000ep + logger: + wandb: + _target_: composer.loggers.wandb_logger.WandBLogger + name: ${name} + project: ${project} + group: ${name} + callbacks: + speed_monitor: + _target_: composer.callbacks.speed_monitor.SpeedMonitor + window_size: 10 + lr_monitor: + _target_: composer.callbacks.lr_monitor.LRMonitor + memory_monitor: + _target_: composer.callbacks.memory_monitor.MemoryMonitor + runtime_estimator: + _target_: composer.callbacks.runtime_estimator.RuntimeEstimator + optimizer_monitor: + _target_: composer.callbacks.OptimizerMonitor + image_logger: + _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages + prompts: + - a couple waiting to cross the street underneath an umbrella. + - three men walking in the rain with umbrellas. 
+ - a man is riding a red motor cycle, with baskets. + - a clock that has animal pictures instead of numbers. + - a brightly decorated bus sits on the road. + - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. + - a white and blue bus is on a city street at night. + - a large clock tower on a building by a river + - beans and other food is sitting on a plate. + - a group of people that are standing up on a tennis court + size: 256 + guidance_scale: 5.0 + trainer: + _target_: composer.Trainer + device: gpu + max_duration: 550000ba + eval_interval: 1000ba + device_train_microbatch_size: 8 # 64 + run_name: ${name} + seed: ${seed} + scale_schedule_ratio: ${scale_schedule_ratio} + save_folder: oci://mosaicml-internal-checkpoints/jasmine/test/ + save_interval: 10000ba + save_overwrite: false + autoresume: true + fsdp_config: + sharding_strategy: "SHARD_GRAD_OP" + progress_bar: false diff --git a/yamls/local-yamls/SDXL-h100-256.yaml b/yamls/local-yamls/SDXL-h100-256.yaml new file mode 100644 index 00000000..86e751fc --- /dev/null +++ b/yamls/local-yamls/SDXL-h100-256.yaml @@ -0,0 +1,212 @@ +run_name: sd2-sdxl-unet-256 +cluster: r9z1 +gpu_num: 32 +env_variables: + - key: NCCL_IB_PCI_RELAXED_ORDERING + value: "0" +image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 +compute: + instance: coreweave.h100-80 +scheduling: + resumable: true + priority: medium +integrations: + - integration_type: "git_repo" + git_repo: jazcollins/diffusion + git_branch: sdxl + pip_install: .[all] + - integration_type: "wandb" + project: jasmine-sd2-sdxl-unet + entity: mosaic-ml +command: | + pip install -U ninja + pip install -U git+https://github.com/facebookresearch/xformers + cd diffusion + HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters + (echo "Command failed - killing python" && pkill python && exit 1) + +parameters: + project: jasmine-sd2-sdxl-unet + batch_size: 2048 + seed: 17 + scale_schedule_ratio: 1.0 + name: 90m-sdxl-unet # wandb run name + eval_first: true + algorithms: + low_precision_groupnorm: + attribute: unet + precision: amp_fp16 + low_precision_layernorm: + attribute: unet + precision: amp_fp16 + model: + _target_: diffusion.models.models.stable_diffusion_2 + pretrained: false + model_name: stabilityai/stable-diffusion-2-base + unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 + precomputed_latents: true + encode_latents_in_fp16: true + fsdp: true + val_metrics: + - _target_: torchmetrics.MeanSquaredError + val_guidance_scales: [] + loss_bins: [] + dataset: + train_batch_size: ${batch_size} + eval_batch_size: 1024 # Should be 8 per device + train_dataset: + _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader + remote: + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/512-768 + - 
oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/256-512 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/512-768 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/768-1024 + - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 + local: + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 + - 
/tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/256-512 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/512-768 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/768-1024 + - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 + batch_size: ${batch_size} + tokenizer_name_or_path: stabilityai/stable-diffusion-2-base + caption_drop_prob: 0.1 + resize_size: 256 + drop_last: true + shuffle: true + prefetch_factor: 2 + num_workers: 8 + persistent_workers: true + pin_memory: true + download_timeout: 900 + num_canonical_nodes: 32 + eval_dataset: + _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader + remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ + local: /tmp/mds-cache/mds-coco-10k-1/ + batch_size: 8 + resize_size: 256 + prefetch_factor: 2 + num_workers: 8 + persistent_workers: True + pin_memory: True + optimizer: + _target_: torch.optim.AdamW + lr: 1.0e-4 + weight_decay: 0.01 + scheduler: + _target_: composer.optim.MultiStepWithWarmupScheduler + t_warmup: 10000ba + milestones: + - 2000ep + logger: + wandb: + _target_: composer.loggers.wandb_logger.WandBLogger + name: ${name} + project: ${project} + group: ${name} + callbacks: + speed_monitor: + _target_: composer.callbacks.speed_monitor.SpeedMonitor + window_size: 10 + lr_monitor: + _target_: composer.callbacks.lr_monitor.LRMonitor + memory_monitor: + _target_: composer.callbacks.memory_monitor.MemoryMonitor + runtime_estimator: + _target_: composer.callbacks.runtime_estimator.RuntimeEstimator + optimizer_monitor: + _target_: composer.callbacks.OptimizerMonitor + image_logger: + _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages + prompts: + - a couple waiting to cross the street underneath an umbrella. + - three men walking in the rain with umbrellas. + - a man is riding a red motor cycle, with baskets. + - a clock that has animal pictures instead of numbers. + - a brightly decorated bus sits on the road. + - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. + - a white and blue bus is on a city street at night. + - a large clock tower on a building by a river + - beans and other food is sitting on a plate. 
+ - a group of people that are standing up on a tennis court + size: 256 + guidance_scale: 5.0 + trainer: + _target_: composer.Trainer + device: gpu + max_duration: 550000ba + eval_interval: 1000ba + device_train_microbatch_size: 64 + run_name: ${name} + seed: ${seed} + scale_schedule_ratio: ${scale_schedule_ratio} + save_folder: oci://mosaicml-internal-checkpoints/jasmine/stable-diffusion-sdxl-unet-256-90m-h100/ + save_interval: 10000ba + save_overwrite: false + autoresume: true + fsdp_config: + sharding_strategy: "SHARD_GRAD_OP" + progress_bar: false From abc7b015172fad35aa1578b3fbe6c285b3ceedd0 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Tue, 8 Aug 2023 00:09:15 +0000 Subject: [PATCH 06/14] remove debug prints --- diffusion/models/models.py | 1 - diffusion/train.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 230c4660..68b6472a 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -84,7 +84,6 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL - print('running SDXL!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 diff --git a/diffusion/train.py b/diffusion/train.py index 18c22dbe..2d4380ba 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -42,7 +42,6 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) - print('sleeping afer dataset creation') # Composer can take dataloaders, dataspecs, evaluators, or list of evaluators eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None @@ -67,7 +66,6 @@ def train(config: DictConfig) -> None: # fix stochastic failures in streaming datasets time.sleep(10) - print('sleeping afer dataset creation') # Build list of loggers, callbacks, and algorithms to pass to trainer logger: List[LoggerDestination] = [] From 631b9f4dfe1c05c204229c6f7c82c6c726ca090b Mon Sep 17 00:00:00 2001 From: jazcollins Date: Tue, 8 Aug 2023 21:59:22 +0000 Subject: [PATCH 07/14] allow passing vae model path --- diffusion/models/models.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 68b6472a..20d23f2d 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -28,6 +28,7 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', unet_model_name: str = 'stabilityai/stable-diffusion-2-base', + vae_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -48,7 +49,9 @@ def stable_diffusion_2( model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to - 'stabilityai/stable-diffusion-2-base' + 'stabilityai/stable-diffusion-2-base'. + vae_model_name (str, optional): Name of the VAE model to load. Defaults to + 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. 
@@ -84,6 +87,7 @@ def stable_diffusion_2( config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 @@ -95,10 +99,13 @@ def stable_diffusion_2( unet.down_blocks._fsdp_wrap = False if encode_latents_in_fp16: - vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) + try: + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) + except: # for handling SDXL vae fp16 fixed checkpoint + vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: - vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae') + vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae') text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') From e785ca6880881d247b5ea271d87e18c7430c4f6a Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 10 Aug 2023 00:02:48 +0000 Subject: [PATCH 08/14] add base --- diffusion/models/models.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 20d23f2d..356f1455 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -86,17 +86,22 @@ def stable_diffusion_2( else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these unet.up_blocks._fsdp_wrap = False unet.down_blocks._fsdp_wrap = False + # for block in unet.up_blocks: + # block._fsdp_wrap = False + # for block in unet.down_blocks: + # block._fsdp_wrap = False + if encode_latents_in_fp16: try: From 218981c82d30be98d67908f8235cb10ba696c622 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:21:37 +0000 Subject: [PATCH 09/14] remove trailing whitespace --- diffusion/models/models.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 356f1455..0c477b1d 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -48,9 +48,9 @@ def stable_diffusion_2( Args: model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. - unet_model_name (str, optional): Name of the UNet model to load. Defaults to + unet_model_name (str, optional): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. - vae_model_name (str, optional): Name of the VAE model to load. 
Defaults to + vae_model_name (str, optional): Name of the VAE model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', @@ -86,14 +86,14 @@ def stable_diffusion_2( else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL print('using SDXL unet!') config[0]['addition_embed_type'] = None config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL + if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these unet.up_blocks._fsdp_wrap = False unet.down_blocks._fsdp_wrap = False @@ -102,11 +102,10 @@ def stable_diffusion_2( # for block in unet.down_blocks: # block._fsdp_wrap = False - if encode_latents_in_fp16: - try: + try: vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) - except: # for handling SDXL vae fp16 fixed checkpoint + except: # for handling SDXL vae fp16 fixed checkpoint vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: From 049a1fb695181a091d93e1526acbd886a5ce485a Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:39:21 +0000 Subject: [PATCH 10/14] split sdxl into separate model --- diffusion/models/models.py | 138 ++++++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 23 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 0c477b1d..b662b78c 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -27,8 +27,6 @@ def stable_diffusion_2( model_name: str = 'stabilityai/stable-diffusion-2-base', - unet_model_name: str = 'stabilityai/stable-diffusion-2-base', - vae_model_name: str = 'stabilityai/stable-diffusion-2-base', pretrained: bool = True, prediction_type: str = 'epsilon', train_metrics: Optional[List] = None, @@ -45,13 +43,114 @@ def stable_diffusion_2( Requires batches of matched images and text prompts to train. Generates images from text prompts. + Args: + model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. + Defaults to 'stabilityai/stable-diffusion-2-base'. + pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + prediction_type (str): The type of prediction to use. Must be one of 'sample', + 'epsilon', or 'v_prediction'. Default: `epsilon`. + train_metrics (list, optional): List of metrics to compute during training. If None, defaults to + [MeanSquaredError()]. + val_metrics (list, optional): List of metrics to compute during validation. If None, defaults to + [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. + val_guidance_scales (list, optional): List of scales to use for validation guidance. 
If None, defaults to + [1.0, 3.0, 7.0]. + val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to + [(0, 1)]. + precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. + fsdp (bool, optional): Whether to use FSDP. Defaults to True. + """ + if train_metrics is None: + train_metrics = [MeanSquaredError()] + if val_metrics is None: + val_metrics = [MeanSquaredError(), FrechetInceptionDistance(normalize=True)] + if val_guidance_scales is None: + val_guidance_scales = [1.0, 3.0, 7.0] + if loss_bins is None: + loss_bins = [(0, 1)] + # Fix a bug where CLIPScore requires grad + for metric in val_metrics: + if isinstance(metric, CLIPScore): + metric.requires_grad_(False) + + if pretrained: + unet = UNet2DConditionModel.from_pretrained(model_name, subfolder='unet') + else: + config = PretrainedConfig.get_config_dict(model_name, subfolder='unet') + unet = UNet2DConditionModel(**config[0]) + + if encode_latents_in_fp16: + vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae', torch_dtype=torch.float16) + text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) + else: + vae = AutoencoderKL.from_pretrained(model_name, subfolder='vae') + text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') + + tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') + noise_scheduler = DDPMScheduler.from_pretrained(model_name, subfolder='scheduler') + inference_noise_scheduler = DDIMScheduler(num_train_timesteps=noise_scheduler.config.num_train_timesteps, + beta_start=noise_scheduler.config.beta_start, + beta_end=noise_scheduler.config.beta_end, + beta_schedule=noise_scheduler.config.beta_schedule, + trained_betas=noise_scheduler.config.trained_betas, + clip_sample=noise_scheduler.config.clip_sample, + set_alpha_to_one=noise_scheduler.config.set_alpha_to_one, + prediction_type=prediction_type) + + model = StableDiffusion( + unet=unet, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + noise_scheduler=noise_scheduler, + inference_noise_scheduler=inference_noise_scheduler, + prediction_type=prediction_type, + train_metrics=train_metrics, + val_metrics=val_metrics, + val_guidance_scales=val_guidance_scales, + val_seed=val_seed, + loss_bins=loss_bins, + precomputed_latents=precomputed_latents, + encode_latents_in_fp16=encode_latents_in_fp16, + fsdp=fsdp, + ) + if torch.cuda.is_available(): + model = DeviceGPU().module_to_device(model) + if is_xformers_installed: + model.unet.enable_xformers_memory_efficient_attention() + model.vae.enable_xformers_memory_efficient_attention() + return model + + +def stable_diffusion_xl( + model_name: str = 'stabilityai/stable-diffusion-2-base', + unet_model_name: str = 'stabilityai/stable-diffusion-xl-base-1.0', + vae_model_name: str = 'madebyollin/sdxl-vae-fp16-fix', + pretrained: bool = True, + prediction_type: str = 'epsilon', + train_metrics: Optional[List] = None, + val_metrics: Optional[List] = None, + val_guidance_scales: Optional[List] = None, + val_seed: int = 1138, + loss_bins: Optional[List] = None, + precomputed_latents: bool = False, + encode_latents_in_fp16: bool = True, + fsdp: bool = True, +): + """Stable diffusion 2 training setup + SDXL UNet and VAE. 
+ + Requires batches of matched images and text prompts to train. Generates images from text + prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. + Args: model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to - 'stabilityai/stable-diffusion-2-base'. + 'stabilityai/stable-diffusion-xl-base-1.0'. vae_model_name (str, optional): Name of the VAE model to load. Defaults to - 'stabilityai/stable-diffusion-2-base'. + 'madebyollin/sdxl-vae-fp16-fix'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. @@ -85,31 +184,24 @@ def stable_diffusion_2( unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') - - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL - print('using SDXL unet!') - config[0]['addition_embed_type'] = None - config[0]['cross_attention_dim'] = 1024 - + # Currently not doing micro-conditioning, so set config appropriately + config[0]['addition_embed_type'] = None + config[0]['cross_attention_dim'] = 1024 unet = UNet2DConditionModel(**config[0]) - if unet_model_name == 'stabilityai/stable-diffusion-xl-refiner-1.0' or unet_model_name == 'stabilityai/stable-diffusion-xl-base-1.0': # SDXL - # Can't fsdp wrap up_blocks or down_blocks because the forward pass calls length on these - unet.up_blocks._fsdp_wrap = False - unet.down_blocks._fsdp_wrap = False - # for block in unet.up_blocks: - # block._fsdp_wrap = False - # for block in unet.down_blocks: - # block._fsdp_wrap = False + # Prevent fsdp from wrapping up_blocks and down_blocks because the forward pass calls length on these + unet.up_blocks._fsdp_wrap = False + unet.down_blocks._fsdp_wrap = False + for block in unet.up_blocks: + block._fsdp_wrap = True + for block in unet.down_blocks: + block._fsdp_wrap = True if encode_latents_in_fp16: - try: - vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae', torch_dtype=torch.float16) - except: # for handling SDXL vae fp16 fixed checkpoint - vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) + vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=torch.float16) else: - vae = AutoencoderKL.from_pretrained(vae_model_name, subfolder='vae') + vae = AutoencoderKL.from_pretrained(vae_model_name) text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder') tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') From df4db968aea509a6b7300d39c6285a34b49421c9 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:41:22 +0000 Subject: [PATCH 11/14] remove local yamls --- yamls/local-yamls/SDXL-a100-256-lite.yaml | 136 -------------- yamls/local-yamls/SDXL-h100-256.yaml | 212 ---------------------- 2 files changed, 348 deletions(-) delete mode 100644 yamls/local-yamls/SDXL-a100-256-lite.yaml delete mode 100644 yamls/local-yamls/SDXL-h100-256.yaml diff --git a/yamls/local-yamls/SDXL-a100-256-lite.yaml 
b/yamls/local-yamls/SDXL-a100-256-lite.yaml deleted file mode 100644 index 60d048da..00000000 --- a/yamls/local-yamls/SDXL-a100-256-lite.yaml +++ /dev/null @@ -1,136 +0,0 @@ -run_name: sd2-sdxl-unet-256 -cluster: r1z1 -gpu_num: 4 -image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 -integrations: - - integration_type: "git_repo" - git_repo: jazcollins/diffusion - git_branch: sdxl - pip_install: .[all] - - integration_type: "wandb" - project: jasmine-sd2-sdxl-unet - entity: mosaic-ml -command: | - pip install -U ninja - pip install -U git+https://github.com/facebookresearch/xformers - cd diffusion - HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters - (echo "Command failed - killing python" && pkill python && exit 1) - -parameters: - project: jasmine-sd2-sdxl-unet - batch_size: 32 # 2048 - seed: 17 - scale_schedule_ratio: 1.0 - name: test # wandb run name - eval_first: true - algorithms: - low_precision_groupnorm: - attribute: unet - precision: amp_fp16 - low_precision_layernorm: - attribute: unet - precision: amp_fp16 - model: - _target_: diffusion.models.models.stable_diffusion_2 - pretrained: false - model_name: stabilityai/stable-diffusion-2-base - unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 - precomputed_latents: true - encode_latents_in_fp16: true - fsdp: true - val_metrics: - - _target_: torchmetrics.MeanSquaredError - val_guidance_scales: [] - loss_bins: [] - dataset: - train_batch_size: ${batch_size} - eval_batch_size: 1024 # Should be 8 per device - train_dataset: - _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader - remote: - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 - local: - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 - batch_size: ${batch_size} - tokenizer_name_or_path: stabilityai/stable-diffusion-2-base - caption_drop_prob: 0.1 - resize_size: 256 - drop_last: true - shuffle: true - prefetch_factor: 2 - num_workers: 8 - persistent_workers: true - pin_memory: true - download_timeout: 900 - num_canonical_nodes: 32 - eval_dataset: - _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader - remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ - local: /tmp/mds-cache/mds-coco-10k-1/ - batch_size: 8 - resize_size: 256 - prefetch_factor: 2 - num_workers: 8 - persistent_workers: True - pin_memory: True - optimizer: - _target_: torch.optim.AdamW - lr: 1.0e-4 - weight_decay: 0.01 - scheduler: - _target_: composer.optim.MultiStepWithWarmupScheduler - t_warmup: 10000ba - milestones: - - 2000ep - logger: - wandb: - _target_: composer.loggers.wandb_logger.WandBLogger - name: ${name} - project: ${project} - group: ${name} - callbacks: - speed_monitor: - _target_: composer.callbacks.speed_monitor.SpeedMonitor - window_size: 10 - lr_monitor: - _target_: composer.callbacks.lr_monitor.LRMonitor - memory_monitor: - _target_: composer.callbacks.memory_monitor.MemoryMonitor - runtime_estimator: - _target_: composer.callbacks.runtime_estimator.RuntimeEstimator - optimizer_monitor: - _target_: composer.callbacks.OptimizerMonitor - image_logger: - _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages - prompts: - - a couple waiting to cross the street underneath an umbrella. - - three men walking in the rain with umbrellas. - - a man is riding a red motor cycle, with baskets. 
- - a clock that has animal pictures instead of numbers. - - a brightly decorated bus sits on the road. - - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. - - a white and blue bus is on a city street at night. - - a large clock tower on a building by a river - - beans and other food is sitting on a plate. - - a group of people that are standing up on a tennis court - size: 256 - guidance_scale: 5.0 - trainer: - _target_: composer.Trainer - device: gpu - max_duration: 550000ba - eval_interval: 1000ba - device_train_microbatch_size: 8 # 64 - run_name: ${name} - seed: ${seed} - scale_schedule_ratio: ${scale_schedule_ratio} - save_folder: oci://mosaicml-internal-checkpoints/jasmine/test/ - save_interval: 10000ba - save_overwrite: false - autoresume: true - fsdp_config: - sharding_strategy: "SHARD_GRAD_OP" - progress_bar: false diff --git a/yamls/local-yamls/SDXL-h100-256.yaml b/yamls/local-yamls/SDXL-h100-256.yaml deleted file mode 100644 index 86e751fc..00000000 --- a/yamls/local-yamls/SDXL-h100-256.yaml +++ /dev/null @@ -1,212 +0,0 @@ -run_name: sd2-sdxl-unet-256 -cluster: r9z1 -gpu_num: 32 -env_variables: - - key: NCCL_IB_PCI_RELAXED_ORDERING - value: "0" -image: mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04 -compute: - instance: coreweave.h100-80 -scheduling: - resumable: true - priority: medium -integrations: - - integration_type: "git_repo" - git_repo: jazcollins/diffusion - git_branch: sdxl - pip_install: .[all] - - integration_type: "wandb" - project: jasmine-sd2-sdxl-unet - entity: mosaic-ml -command: | - pip install -U ninja - pip install -U git+https://github.com/facebookresearch/xformers - cd diffusion - HYDRA_FULL_ERROR=1 composer run.py --config-path /mnt/config --config-name parameters - (echo "Command failed - killing python" && pkill python && exit 1) - -parameters: - project: jasmine-sd2-sdxl-unet - batch_size: 2048 - seed: 17 - scale_schedule_ratio: 1.0 - name: 90m-sdxl-unet # wandb run name - eval_first: true - algorithms: - low_precision_groupnorm: - attribute: unet - precision: amp_fp16 - low_precision_layernorm: - attribute: unet - precision: amp_fp16 - model: - _target_: diffusion.models.models.stable_diffusion_2 - pretrained: false - model_name: stabilityai/stable-diffusion-2-base - unet_model_name: stabilityai/stable-diffusion-xl-refiner-1.0 - precomputed_latents: true - encode_latents_in_fp16: true - fsdp: true - val_metrics: - - _target_: torchmetrics.MeanSquaredError - val_guidance_scales: [] - loss_bins: [] - dataset: - train_batch_size: ${batch_size} - eval_batch_size: 1024 # Should be 8 per device - train_dataset: - _target_: diffusion.datasets.laion.laion.build_streaming_laion_dataloader - remote: - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/512-768 - - 
oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/256-512 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/512-768 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/768-1024 - - oci://mosaicml-internal-dataset-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 - local: - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/0/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/1/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/2/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/3/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/4/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/5/1024-1048576 - - 
/tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/6/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/7/1024-1048576 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/256-512 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/512-768 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/768-1024 - - /tmp/mds-cache/mds-laion2b-en/4.5v2/10m-subsets/8/1024-1048576 - batch_size: ${batch_size} - tokenizer_name_or_path: stabilityai/stable-diffusion-2-base - caption_drop_prob: 0.1 - resize_size: 256 - drop_last: true - shuffle: true - prefetch_factor: 2 - num_workers: 8 - persistent_workers: true - pin_memory: true - download_timeout: 900 - num_canonical_nodes: 32 - eval_dataset: - _target_: diffusion.datasets.coco.coco_captions.build_streaming_cocoval_dataloader - remote: oci://mosaicml-internal-dataset-coco/2014/val/10k-1/ - local: /tmp/mds-cache/mds-coco-10k-1/ - batch_size: 8 - resize_size: 256 - prefetch_factor: 2 - num_workers: 8 - persistent_workers: True - pin_memory: True - optimizer: - _target_: torch.optim.AdamW - lr: 1.0e-4 - weight_decay: 0.01 - scheduler: - _target_: composer.optim.MultiStepWithWarmupScheduler - t_warmup: 10000ba - milestones: - - 2000ep - logger: - wandb: - _target_: composer.loggers.wandb_logger.WandBLogger - name: ${name} - project: ${project} - group: ${name} - callbacks: - speed_monitor: - _target_: composer.callbacks.speed_monitor.SpeedMonitor - window_size: 10 - lr_monitor: - _target_: composer.callbacks.lr_monitor.LRMonitor - memory_monitor: - _target_: composer.callbacks.memory_monitor.MemoryMonitor - runtime_estimator: - _target_: composer.callbacks.runtime_estimator.RuntimeEstimator - optimizer_monitor: - _target_: composer.callbacks.OptimizerMonitor - image_logger: - _target_: diffusion.callbacks.log_diffusion_images.LogDiffusionImages - prompts: - - a couple waiting to cross the street underneath an umbrella. - - three men walking in the rain with umbrellas. - - a man is riding a red motor cycle, with baskets. - - a clock that has animal pictures instead of numbers. - - a brightly decorated bus sits on the road. - - a horse bucking with a rider on it, completely vertical, with another horse and onlookers. - - a white and blue bus is on a city street at night. - - a large clock tower on a building by a river - - beans and other food is sitting on a plate. 
- - a group of people that are standing up on a tennis court - size: 256 - guidance_scale: 5.0 - trainer: - _target_: composer.Trainer - device: gpu - max_duration: 550000ba - eval_interval: 1000ba - device_train_microbatch_size: 64 - run_name: ${name} - seed: ${seed} - scale_schedule_ratio: ${scale_schedule_ratio} - save_folder: oci://mosaicml-internal-checkpoints/jasmine/stable-diffusion-sdxl-unet-256-90m-h100/ - save_interval: 10000ba - save_overwrite: false - autoresume: true - fsdp_config: - sharding_strategy: "SHARD_GRAD_OP" - progress_bar: false From f31cd8fa8e227bfde806b1cbed528cb9f6edc645 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:44:16 +0000 Subject: [PATCH 12/14] clean up sd2 doc --- diffusion/models/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index b662b78c..b52e6fd0 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -44,8 +44,7 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. - Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. From b07abcf5aecf4c88790c8f71ce2d5fffd8cfbf64 Mon Sep 17 00:00:00 2001 From: jazcollins Date: Thu, 17 Aug 2023 20:57:09 +0000 Subject: [PATCH 13/14] one more doc fix --- diffusion/models/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index b52e6fd0..5ab83398 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -144,8 +144,8 @@ def stable_diffusion_xl( prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder and autoencder. - Defaults to 'stabilityai/stable-diffusion-2-base'. + model_name (str, optional): Name of the model to load. Determines the text encoder, tokenizer, + and noise scheduler. Defaults to 'stabilityai/stable-diffusion-2-base'. unet_model_name (str, optional): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-xl-base-1.0'. vae_model_name (str, optional): Name of the VAE model to load. Defaults to From 9739743e955020e6c39298ef46d346d4758b76de Mon Sep 17 00:00:00 2001 From: jazcollins Date: Mon, 21 Aug 2023 16:58:57 +0000 Subject: [PATCH 14/14] add NotImplementedError, fix docs --- diffusion/models/models.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 5ab83398..e2eed1a8 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -44,8 +44,8 @@ def stable_diffusion_2( prompts. Args: - model_name (str, optional): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. - pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + model_name (str): Name of the model to load. Defaults to 'stabilityai/stable-diffusion-2-base'. + pretrained (bool): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. 
Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. train_metrics (list, optional): List of metrics to compute during training. If None, defaults to @@ -54,12 +54,12 @@ def stable_diffusion_2( [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. val_guidance_scales (list, optional): List of scales to use for validation guidance. If None, defaults to [1.0, 3.0, 7.0]. - val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + val_seed (int): Seed to use for generating evaluation images. Defaults to 1138. loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to [(0, 1)]. - precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. - encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. - fsdp (bool, optional): Whether to use FSDP. Defaults to True. + precomputed_latents (bool): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool): Whether to encode latents in fp16. Defaults to True. + fsdp (bool): Whether to use FSDP. Defaults to True. """ if train_metrics is None: train_metrics = [MeanSquaredError()] @@ -144,13 +144,14 @@ def stable_diffusion_xl( prompts. Currently uses UNet and VAE config from SDXL, but text encoder/tokenizer from SD2. Args: - model_name (str, optional): Name of the model to load. Determines the text encoder, tokenizer, + model_name (str): Name of the model to load. Determines the text encoder, tokenizer, and noise scheduler. Defaults to 'stabilityai/stable-diffusion-2-base'. - unet_model_name (str, optional): Name of the UNet model to load. Defaults to + unet_model_name (str): Name of the UNet model to load. Defaults to 'stabilityai/stable-diffusion-xl-base-1.0'. - vae_model_name (str, optional): Name of the VAE model to load. Defaults to - 'madebyollin/sdxl-vae-fp16-fix'. - pretrained (bool, optional): Whether to load pretrained weights. Defaults to True. + vae_model_name (str): Name of the VAE model to load. Defaults to + 'madebyollin/sdxl-vae-fp16-fix' as the official VAE checkpoint (from + 'stabilityai/stable-diffusion-xl-base-1.0') is not compatible with fp16. + pretrained (bool): Whether to load pretrained weights. Defaults to True. prediction_type (str): The type of prediction to use. Must be one of 'sample', 'epsilon', or 'v_prediction'. Default: `epsilon`. train_metrics (list, optional): List of metrics to compute during training. If None, defaults to @@ -159,12 +160,12 @@ def stable_diffusion_xl( [MeanSquaredError(), FrechetInceptionDistance(normalize=True)]. val_guidance_scales (list, optional): List of scales to use for validation guidance. If None, defaults to [1.0, 3.0, 7.0]. - val_seed (int, optional): Seed to use for generating evaluation images. Defaults to 1138. + val_seed (int): Seed to use for generating evaluation images. Defaults to 1138. loss_bins (list, optional): List of tuples of (min, max) values to use for loss binning. If None, defaults to [(0, 1)]. - precomputed_latents (bool, optional): Whether to use precomputed latents. Defaults to False. - encode_latents_in_fp16 (bool, optional): Whether to encode latents in fp16. Defaults to True. - fsdp (bool, optional): Whether to use FSDP. Defaults to True. + precomputed_latents (bool): Whether to use precomputed latents. Defaults to False. + encode_latents_in_fp16 (bool): Whether to encode latents in fp16. Defaults to True. + fsdp (bool): Whether to use FSDP. 
Defaults to True. """ if train_metrics is None: train_metrics = [MeanSquaredError()] @@ -180,7 +181,7 @@ def stable_diffusion_xl( metric.requires_grad_(False) if pretrained: - unet = UNet2DConditionModel.from_pretrained(unet_model_name, subfolder='unet') + raise NotImplementedError('Full SDXL pipeline not implemented yet.') else: config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet') # Currently not doing micro-conditioning, so set config appropriately
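
The later patches in this series converge on a specific construction for the new stable_diffusion_xl factory: build the SDXL UNet from its config rather than pretrained weights, disable micro-conditioning, resize cross-attention for the SD2 text encoder, set Composer FSDP wrapping hints on the UNet blocks, and load the fp16-safe VAE checkpoint from the repo root. The snippet below is a minimal standalone sketch of that construction for reference; it is not part of any patch above, and it assumes diffusers==0.19.0 and transformers are installed with access to the Hugging Face Hub. The _fsdp_wrap attributes are Composer-specific hints and have no effect outside Composer's FSDP wrapping.

import torch
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import PretrainedConfig

unet_model_name = 'stabilityai/stable-diffusion-xl-base-1.0'
vae_model_name = 'madebyollin/sdxl-vae-fp16-fix'

# Build the SDXL UNet from config only (the pretrained=False path in the patches):
# micro-conditioning is disabled and cross-attention is sized for the SD2 text encoder.
config = PretrainedConfig.get_config_dict(unet_model_name, subfolder='unet')
config[0]['addition_embed_type'] = None
config[0]['cross_attention_dim'] = 1024
unet = UNet2DConditionModel(**config[0])

# Composer FSDP hints: the up_blocks/down_blocks containers have len() called on them
# in the forward pass, so leave the containers unwrapped and wrap each child block.
unet.up_blocks._fsdp_wrap = False
unet.down_blocks._fsdp_wrap = False
for block in unet.up_blocks:
    block._fsdp_wrap = True
for block in unet.down_blocks:
    block._fsdp_wrap = True

# The fp16-fixed SDXL VAE checkpoint lives at the repo root, so no subfolder='vae' here.
vae = AutoencoderKL.from_pretrained(vae_model_name, torch_dtype=torch.float16)

print(f'UNet params: {sum(p.numel() for p in unet.parameters()) / 1e6:.1f}M')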