Commit 94db49a
Merge branch 'sd3' of https://github.com/sdbds/sd-scripts into qinglong
sdbds committed Oct 10, 2024
2 parents 6aee7b3 + 1bb33b7 commit 94db49a
Showing 7 changed files with 200 additions and 111 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -11,6 +11,10 @@ The command to install PyTorch is as follows:

### Recent Updates

+Oct 6, 2024:
+- In FLUX.1 LoRA training and fine-tuning, the specified weight file (*.safetensors) is now automatically detected as dev or schnell, so schnell models load correctly. Note that LoRA training and fine-tuning with schnell models are still unverified.
+- FLUX.1 LoRA training and fine-tuning can now load weights in Diffusers format in addition to BFL format (a single *.safetensors file). Specify the full path to the parent directory of `transformer` or to `diffusion_pytorch_model-00001-of-00003.safetensors`. However, Diffusers-format CLIP/T5XXL is not supported. Saving is supported only in BFL format.

Sep 26, 2024:
The implementation of block swap during FLUX.1 fine-tuning has been changed, improving speed by about 10% (depending on the environment). A new `--blocks_to_swap` option has been added, and `--double_blocks_to_swap` and `--single_blocks_to_swap` are deprecated; they still work as before but will be removed in the future. See [FLUX.1 fine-tuning](#flux1-fine-tuning) for details.

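The Oct 6 notes above hinge on detecting the model variant and checkpoint format from the weights themselves rather than from the file name. Below is a minimal sketch of how such detection can work for a single BFL-style *.safetensors file; the function name and key heuristics are illustrative assumptions, not the repository's actual `check_flux_state_dict_diffusers_schnell` implementation.

```python
# Illustrative sketch: guess dev/schnell and BFL/Diffusers from state-dict
# keys. The exact heuristics used by sd-scripts may differ.
from safetensors import safe_open


def detect_flux_variant(ckpt_path: str) -> tuple[bool, bool]:
    """Return (is_diffusers, is_schnell), guessed from checkpoint keys."""
    with safe_open(ckpt_path, framework="pt", device="cpu") as f:
        keys = set(f.keys())
    # Diffusers-format transformers use a different key prefix than BFL files.
    is_diffusers = any(k.startswith("transformer_blocks.") for k in keys)
    # FLUX.1 dev carries a guidance embedder; schnell does not, so its absence
    # is a plausible schnell signal (an assumption, not the repo's rule).
    is_schnell = not any(k.startswith("guidance_in.") for k in keys)
    return is_diffusers, is_schnell
```

Detection along these lines is what lets `load_flow_model` drop its explicit `name` argument in the diffs below and return an `is_schnell` flag alongside the model.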
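The Sep 26 block-swap note describes keeping only part of the transformer resident on the GPU. A minimal sketch of the idea, with illustrative names and none of the prefetching that makes the real implementation faster:

```python
import torch


def forward_with_block_swap(
    blocks: torch.nn.ModuleList, x: torch.Tensor, device: torch.device, blocks_to_swap: int
) -> torch.Tensor:
    """Run blocks in order, staging the first `blocks_to_swap` of them through
    the GPU one at a time to bound peak VRAM usage."""
    for i, block in enumerate(blocks):
        swapped = i < blocks_to_swap
        if swapped:
            block.to(device)  # stage in just before use
        x = block(x)
        if swapped:
            block.to("cpu")  # stage out so the next block has room
    return x
```

Replacing the separate `--double_blocks_to_swap`/`--single_blocks_to_swap` options with a single `--blocks_to_swap` puts one knob in charge of how many blocks are staged this way.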
15 changes: 6 additions & 9 deletions flux_minimal_inference.py
@@ -419,9 +419,6 @@ def encode(prpt: str):
steps = args.steps
guidance_scale = args.guidance

-name = "schnell" if "schnell" in args.ckpt_path else "dev"  # TODO change this to a more robust way
-is_schnell = name == "schnell"

def is_fp8(dt):
return dt in [torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz]

@@ -455,12 +452,8 @@ def is_fp8(dt):
# if is_fp8(t5xxl_dtype):
# t5xxl = accelerator.prepare(t5xxl)

-t5xxl_max_length = 256 if is_schnell else 512
-tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_length)
-encoding_strategy = strategy_flux.FluxTextEncodingStrategy()

# DiT
-model = flux_utils.load_flow_model(name, args.ckpt_path, None, loading_device)
+is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device)
model.eval()
logger.info(f"Casting model to {flux_dtype}")
model.to(flux_dtype) # make sure model is dtype
@@ -469,8 +462,12 @@ def is_fp8(dt):
# if args.offload:
# model = model.to("cpu")

+t5xxl_max_length = 256 if is_schnell else 512
+tokenize_strategy = strategy_flux.FluxTokenizeStrategy(t5xxl_max_length)
+encoding_strategy = strategy_flux.FluxTextEncodingStrategy()

# AE
-ae = flux_utils.load_ae(name, args.ae, ae_dtype, loading_device)
+ae = flux_utils.load_ae(args.ae, ae_dtype, loading_device)
ae.eval()
# if is_fp8(ae_dtype):
# ae = accelerator.prepare(ae)
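Net effect of the changes in this file: the script no longer guesses dev/schnell from the file name (`"schnell" in args.ckpt_path`); `load_flow_model` now returns an `is_schnell` flag read from the checkpoint, which in turn drives the T5XXL token length. The new call shape, as a usage sketch assembled from the lines above (assuming the script's existing `args` and imports):

```python
# Usage sketch of the post-commit API, per the diff above.
is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device)
t5xxl_max_length = 256 if is_schnell else 512  # schnell uses the shorter T5 context
ae = flux_utils.load_ae(args.ae, ae_dtype, loading_device)  # no name argument either
```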
17 changes: 9 additions & 8 deletions flux_train.py
@@ -137,16 +137,16 @@ def train(args):

train_dataset_group.verify_bucket_reso_steps(16)  # TODO: check whether this is right

+_, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path)
if args.debug_dataset:
if args.cache_text_encoder_outputs:
strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(
strategy_flux.FluxTextEncoderOutputsCachingStrategy(
args.cache_text_encoder_outputs_to_disk, args.text_encoder_batch_size, False, False
)
)
-name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev"
t5xxl_max_token_length = (
-args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if name == "schnell" else 512)
+args.t5xxl_max_token_length if args.t5xxl_max_token_length is not None else (256 if is_schnell else 512)
)
strategy_base.TokenizeStrategy.set_strategy(strategy_flux.FluxTokenizeStrategy(t5xxl_max_token_length))

Expand Down Expand Up @@ -177,7 +177,6 @@ def train(args):
weight_dtype, save_dtype = train_util.prepare_dtype(args)

# load the model
-name = "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev"

def unwrap_model(model):
model = accelerator.unwrap_model(model)
@@ -187,7 +186,7 @@ def unwrap_model(model):
# load VAE for caching latents
ae = None
if cache_latents:
-ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors)
+ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", args.disable_mmap_load_safetensors)
ae.to(accelerator.device, dtype=weight_dtype)
ae.requires_grad_(False)
ae.eval()
@@ -201,7 +200,7 @@ def unwrap_model(model):

# prepare tokenize strategy
if args.t5xxl_max_token_length is None:
-if name == "schnell":
+if is_schnell:
t5xxl_max_token_length = 256
else:
t5xxl_max_token_length = 512
@@ -265,8 +264,8 @@ def unwrap_model(model):
clean_memory_on_device(accelerator.device)

# load FLUX
-flux = flux_utils.load_flow_model(
-name, args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors
+_, flux = flux_utils.load_flow_model(
+args.pretrained_model_name_or_path, weight_dtype, "cpu", args.disable_mmap_load_safetensors
)

if args.gradient_checkpointing:
@@ -301,7 +300,7 @@ def unwrap_model(model):

if not cache_latents:
# load VAE here if not cached
-ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu")
+ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu")
ae.requires_grad_(False)
ae.eval()
ae.to(accelerator.device, dtype=weight_dtype)
@@ -721,7 +720,9 @@ def optimizer_hook(parameter: torch.Tensor):
accelerator.unwrap_model(flux).prepare_block_swap_before_forward()

# For --sample_at_first
+optimizer_eval_fn()
flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs)
+optimizer_train_fn()
if len(accelerator.trackers) > 0:
# log empty object to commit the sample images to wandb
accelerator.log({}, step=0)
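The added `optimizer_eval_fn()` / `optimizer_train_fn()` calls bracket sample-image generation so that optimizers with distinct train/eval behavior (e.g. schedule-free optimizers) sample from the correct weights. A sketch of how such toggles can be constructed; this helper is illustrative, not the repository's actual wiring:

```python
from typing import Callable, Tuple


def make_optimizer_mode_fns(optimizer) -> Tuple[Callable[[], None], Callable[[], None]]:
    """Return (eval_fn, train_fn). Schedule-free optimizers expose .eval()/.train()
    and must be switched around inference; ordinary optimizers get no-ops."""
    if hasattr(optimizer, "eval") and hasattr(optimizer, "train"):
        return optimizer.eval, optimizer.train
    return (lambda: None), (lambda: None)
```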
17 changes: 7 additions & 10 deletions flux_train_network.py
@@ -2,7 +2,7 @@
import copy
import math
import random
-from typing import Any
+from typing import Any, Optional

import torch
from accelerate import Accelerator
@@ -25,6 +25,7 @@ class FluxNetworkTrainer(train_network.NetworkTrainer):
def __init__(self):
super().__init__()
self.sample_prompts_te_outputs = None
+self.is_schnell: Optional[bool] = None

def assert_extra_args(self, args, train_dataset_group):
super().assert_extra_args(args, train_dataset_group)
@@ -58,19 +59,15 @@ def assert_extra_args(self, args, train_dataset_group):

train_dataset_group.verify_bucket_reso_steps(32) # TODO check this

-def get_flux_model_name(self, args):
-return "schnell" if "schnell" in args.pretrained_model_name_or_path else "dev"

def load_target_model(self, args, weight_dtype, accelerator):
# currently offload to cpu for some models
-name = self.get_flux_model_name(args)

# if the file is fp8 and we are using fp8_base, we can load it as is (fp8)
loading_dtype = None if args.fp8_base else weight_dtype

# if we load to cpu, flux.to(fp8) takes a long time, so we should load to gpu in future
-model = flux_utils.load_flow_model(
-name, args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors
+self.is_schnell, model = flux_utils.load_flow_model(
+args.pretrained_model_name_or_path, loading_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors
)
if args.fp8_base:
# check dtype of model
@@ -101,7 +98,7 @@ def load_target_model(self, args, weight_dtype, accelerator):
elif t5xxl.dtype == torch.float8_e4m3fn:
logger.info("Loaded fp8 T5XXL model")

-ae = flux_utils.load_ae(name, args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors)
+ae = flux_utils.load_ae(args.ae, weight_dtype, "cpu", disable_mmap=args.disable_mmap_load_safetensors)

return flux_utils.MODEL_VERSION_FLUX_V1, [clip_l, t5xxl], ae, model

@@ -143,10 +140,10 @@ def prepare_split_model(self, model, weight_dtype, accelerator):
return flux_lower

def get_tokenize_strategy(self, args):
-name = self.get_flux_model_name(args)
+_, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(args.pretrained_model_name_or_path)

if args.t5xxl_max_token_length is None:
-if name == "schnell":
+if is_schnell:
t5xxl_max_token_length = 256
else:
t5xxl_max_token_length = 512
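In this file the trainer stores the flag from `load_target_model` on `self.is_schnell`, while `get_tokenize_strategy` re-reads the checkpoint header via `check_flux_state_dict_diffusers_schnell`. If the repeated header read ever mattered, the result could be memoized; a sketch (the cache wrapper and import path are assumptions, not part of this commit):

```python
import functools

from library import flux_utils  # import path assumed from the surrounding scripts


@functools.lru_cache(maxsize=None)
def cached_is_schnell(ckpt_path: str) -> bool:
    """Memoize the dev/schnell check so each checkpoint header is read once."""
    _, is_schnell, _ = flux_utils.check_flux_state_dict_diffusers_schnell(ckpt_path)
    return is_schnell
```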
(Diffs for the remaining 3 changed files are not shown.)
