From 87526942a67fd71bb775bc479b0a7449df516dd8 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Fri, 12 Jul 2024 22:56:38 +0800 Subject: [PATCH 01/21] select interpolation based on image size --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..74720fec6 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2362,7 +2362,7 @@ def trim_and_resize_if_required( if image_width != resized_size[0] or image_height != resized_size[1]: # リサイズする - image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ + image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA if image_width > resized_size[0] and image_height > resized_size[1] else cv2.INTER_LANCZOS4) image_height, image_width = image.shape[0:2] From 2e67978ee243a20f169ce76d7644bb1f9dec9bad Mon Sep 17 00:00:00 2001 From: Millie Date: Thu, 18 Jul 2024 11:52:58 -0700 Subject: [PATCH 02/21] Generate sample images without having CUDA (such as on Macs) --- library/train_util.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..9b0397d7d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -5229,7 +5229,7 @@ def sample_images_common( clean_memory_on_device(accelerator.device) torch.set_rng_state(rng_state) - if cuda_rng_state is not None: + if torch.cuda.is_available() and cuda_rng_state is not None: torch.cuda.set_rng_state(cuda_rng_state) vae.to(org_vae_device) @@ -5263,11 +5263,13 @@ def sample_image_inference( if seed is not None: torch.manual_seed(seed) - torch.cuda.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) else: # True random sample image generation torch.seed() - torch.cuda.seed() + if torch.cuda.is_available(): + torch.cuda.seed() scheduler = get_my_scheduler( sample_sampler=sampler_name, @@ -5302,8 +5304,9 @@ def sample_image_inference( controlnet_image=controlnet_image, ) - with torch.cuda.device(torch.cuda.current_device()): - torch.cuda.empty_cache() + if torch.cuda.is_available(): + with torch.cuda.device(torch.cuda.current_device()): + torch.cuda.empty_cache() image = pipeline.latents_to_image(latents)[0] From 1f16b80e88b1c4f05d49b4fc328d3b9b105ebcbe Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sat, 20 Jul 2024 21:35:24 +0800 Subject: [PATCH 03/21] Revert "select interpolation based on image size" This reverts commit 87526942a67fd71bb775bc479b0a7449df516dd8.
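Note: the heuristic that PATCH 01 introduces and PATCH 03 reverts (before PATCH 04 reintroduces it via PIL) is the usual rule for image resampling: cv2.INTER_AREA averages source pixels and suits downscaling, while a Lanczos filter suits upscaling. A minimal, self-contained sketch of that selection logic, independent of the sd-scripts code (the helper name pick_interpolation is illustrative only):

    import cv2

    def pick_interpolation(src_hw, dst_wh):
        # INTER_AREA when shrinking: area averaging avoids aliasing.
        # INTER_LANCZOS4 when enlarging: windowed sinc keeps edges sharp.
        src_h, src_w = src_hw
        if src_w > dst_wh[0] and src_h > dst_wh[1]:
            return cv2.INTER_AREA
        return cv2.INTER_LANCZOS4

    # usage sketch:
    # image = cv2.resize(image, (dst_w, dst_h), interpolation=pick_interpolation(image.shape[:2], (dst_w, dst_h)))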
--- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index 74720fec6..15c23f3cc 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2362,7 +2362,7 @@ def trim_and_resize_if_required( if image_width != resized_size[0] or image_height != resized_size[1]: # リサイズする - image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA if image_width > resized_size[0] and image_height > resized_size[1] else cv2.INTER_LANCZOS4) + image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ image_height, image_width = image.shape[0:2] From 9ca7a5b6cc99e25820a1aa6d02a779004d73bca0 Mon Sep 17 00:00:00 2001 From: sdbds <865105819@qq.com> Date: Sat, 20 Jul 2024 21:59:11 +0800 Subject: [PATCH 04/21] use PIL resize instead of cv2 LANCZOS4 resize --- finetune/tag_images_by_wd14_tagger.py | 8 +++++--- library/train_util.py | 11 ++++++----- library/utils.py | 14 +++++++++++++- tools/detect_face_rotate.py | 7 +++++-- tools/resize_images_to_resolution.py | 11 +++++++---- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index a327bbd61..6f5bdd36b 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -11,7 +11,7 @@ from tqdm import tqdm import library.train_util as train_util -from library.utils import setup_logging +from library.utils import setup_logging, pil_resize setup_logging() import logging @@ -42,8 +42,10 @@ def preprocess_image(image): pad_t = pad_y // 2 image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode="constant", constant_values=255) - interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4 - image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp) + if size > IMAGE_SIZE: + image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_AREA) + else: + image = pil_resize(image, (IMAGE_SIZE, IMAGE_SIZE)) image = image.astype(np.float32) return image diff --git a/library/train_util.py b/library/train_util.py index 15c23f3cc..160e3b44b 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -71,7 +71,7 @@ import library.huggingface_util as huggingface_util import library.sai_model_spec as sai_model_spec import library.deepspeed_utils as deepspeed_utils -from library.utils import setup_logging +from library.utils import setup_logging, pil_resize setup_logging() import logging @@ -2028,9 +2028,7 @@ def __getitem__(self, index): # ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}" # resize to target if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]: - cond_img = cv2.resize( - cond_img, (int(target_size_hw[1]), int(target_size_hw[0])), interpolation=cv2.INTER_LANCZOS4 - ) + cond_img=pil_resize(cond_img,(int(target_size_hw[1]), int(target_size_hw[0]))) if flipped: cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride @@ -2362,7 +2360,10 @@ def trim_and_resize_if_required( if image_width != resized_size[0] or image_height != resized_size[1]: # リサイズする - image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ + if image_width > resized_size[0] and image_height > resized_size[1]: + image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ + else: + image = pil_resize(image, resized_size) image_height, image_width =
image.shape[0:2] diff --git a/library/utils.py b/library/utils.py index 3037c055d..a219f6cb7 100644 --- a/library/utils.py +++ b/library/utils.py @@ -7,7 +7,9 @@ from diffusers import EulerAncestralDiscreteScheduler import diffusers.schedulers.scheduling_euler_ancestral_discrete from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput - +import cv2 +from PIL import Image +import numpy as np def fire_in_thread(f, *args, **kwargs): threading.Thread(target=f, args=args, kwargs=kwargs).start() @@ -78,7 +80,17 @@ def setup_logging(args=None, log_level=None, reset=False): logger = logging.getLogger(__name__) logger.info(msg_init) +def pil_resize(image, size, interpolation=Image.LANCZOS): + + pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # use Pillow resize + resized_pil = pil_image.resize(size, interpolation) + + # return cv2 image + resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) + return resized_cv2 # TODO make inf_utils.py diff --git a/tools/detect_face_rotate.py b/tools/detect_face_rotate.py index bbc643edc..d2a4d9cfb 100644 --- a/tools/detect_face_rotate.py +++ b/tools/detect_face_rotate.py @@ -15,7 +15,7 @@ from anime_face_detector import create_detector from tqdm import tqdm import numpy as np -from library.utils import setup_logging +from library.utils import setup_logging, pil_resize setup_logging() import logging logger = logging.getLogger(__name__) @@ -172,7 +172,10 @@ def process(args): if scale != 1.0: w = int(w * scale + .5) h = int(h * scale + .5) - face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LANCZOS4) + if scale < 1.0: + face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA) + else: + face_img = pil_resize(face_img, (w, h)) cx = int(cx * scale + .5) cy = int(cy * scale + .5) fw = int(fw * scale + .5) diff --git a/tools/resize_images_to_resolution.py b/tools/resize_images_to_resolution.py index b8069fc1d..0f9e00b1e 100644 --- a/tools/resize_images_to_resolution.py +++ b/tools/resize_images_to_resolution.py @@ -6,7 +6,7 @@ import math from PIL import Image import numpy as np -from library.utils import setup_logging +from library.utils import setup_logging, pil_resize setup_logging() import logging logger = logging.getLogger(__name__) @@ -24,9 +24,9 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi # Select interpolation method if interpolation == 'lanczos4': - cv2_interpolation = cv2.INTER_LANCZOS4 + pil_interpolation = Image.LANCZOS elif interpolation == 'cubic': - cv2_interpolation = cv2.INTER_CUBIC + pil_interpolation = Image.BICUBIC else: cv2_interpolation = cv2.INTER_AREA @@ -64,7 +64,10 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi new_width = int(img.shape[1] * math.sqrt(scale_factor)) # Resize image - img = cv2.resize(img, (new_width, new_height), interpolation=cv2_interpolation) + if cv2_interpolation: + img = cv2.resize(img, (new_width, new_height), interpolation=cv2_interpolation) + else: + img = pil_resize(img, (new_width, new_height), interpolation=pil_interpolation) else: new_height, new_width = img.shape[0:2] From 2a3aefb4e44dce1f189677d0a996ba0244633956 Mon Sep 17 00:00:00 2001 From: Nando Metzger <42088121+nandometzger@users.noreply.github.com> Date: Fri, 30 Aug 2024 08:15:05 +0200 Subject: [PATCH 05/21] Update train_util.py, bug fix --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/library/train_util.py b/library/train_util.py index 15c23f3cc..0fec565db 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -1489,7 +1489,7 @@ def read_caption(img_path, caption_extension, enable_wildcard): def load_dreambooth_dir(subset: DreamBoothSubset): if not os.path.isdir(subset.image_dir): logger.warning(f"not directory: {subset.image_dir}") - return [], [] + return [], [], [] info_cache_file = os.path.join(subset.image_dir, self.IMAGE_INFO_CACHE_FILE) use_cached_info_for_subset = subset.cache_info From 3a6154b7b0dbcae82d24adacf5a76f75288b98f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 31 Aug 2024 06:21:16 +0000 Subject: [PATCH 06/21] Bump opencv-python from 4.7.0.68 to 4.8.1.78 Bumps [opencv-python](https://github.com/opencv/opencv-python) from 4.7.0.68 to 4.8.1.78. - [Release notes](https://github.com/opencv/opencv-python/releases) - [Commits](https://github.com/opencv/opencv-python/commits) --- updated-dependencies: - dependency-name: opencv-python dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e99775b8a..977c5cd91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ transformers==4.36.2 diffusers[torch]==0.25.0 ftfy==6.1.1 # albumentations==1.3.0 -opencv-python==4.7.0.68 +opencv-python==4.8.1.78 einops==0.7.0 pytorch-lightning==1.9.0 bitsandbytes==0.43.0 From 1bcf8d600bfb9f4314a41a12a5e7b272a17ceaed Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 1 Sep 2024 01:33:04 +0000 Subject: [PATCH 07/21] Bump crate-ci/typos from 1.19.0 to 1.24.3 Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.19.0 to 1.24.3. - [Release notes](https://github.com/crate-ci/typos/releases) - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md) - [Commits](https://github.com/crate-ci/typos/compare/v1.19.0...v1.24.3) --- updated-dependencies: - dependency-name: crate-ci/typos dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/typos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index e8b06483f..0149dcdd3 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -18,4 +18,4 @@ jobs: - uses: actions/checkout@v4 - name: typos-action - uses: crate-ci/typos@v1.19.0 + uses: crate-ci/typos@v1.24.3 From 0005867ba509d2e1a5674b267e8286b561c0ed71 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Sat, 7 Sep 2024 10:45:18 +0900 Subject: [PATCH 08/21] update README, format code --- README.md | 5 +++++ library/train_util.py | 4 ++-- library/utils.py | 4 +++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 81a549378..16ab80e7a 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ### Working in progress +- When enlarging images in the script (when the size of the training image is small and bucket_no_upscale is not specified), it has been changed to use Pillow's resize and LANCZOS interpolation instead of OpenCV2's resize and Lanczos4 interpolation. The quality of the image enlargement may be slightly improved. 
PR [#1426](https://github.com/kohya-ss/sd-scripts/pull/1426) Thanks to sdbds! + +- Sample image generation during training now works on non-CUDA devices. PR [#1433](https://github.com/kohya-ss/sd-scripts/pull/1433) Thanks to millie-v! + - `--v_parameterization` is available in `sdxl_train.py`. The results are unpredictable, so use with caution. PR [#1505](https://github.com/kohya-ss/sd-scripts/pull/1505) Thanks to liesened! + - Fused optimizer is available for SDXL training. PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) Thanks to 2kpr! - The memory usage during training is significantly reduced by integrating the optimizer's backward pass with step. The training results are the same as before, but if you have plenty of memory, the speed will be slower. - Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only AdaFactor is supported. Gradient accumulation is not available. diff --git a/library/train_util.py b/library/train_util.py index 102d39ed7..1441e74f6 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2094,7 +2094,7 @@ def __getitem__(self, index): # ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}" # resize to target if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]: - cond_img=pil_resize(cond_img,(int(target_size_hw[1]), int(target_size_hw[0]))) + cond_img = pil_resize(cond_img, (int(target_size_hw[1]), int(target_size_hw[0]))) if flipped: cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride @@ -2432,7 +2432,7 @@ def load_arbitrary_dataset(args, tokenizer) -> MinimalDataset: return train_dataset_group -def load_image(image_path, alpha=False): +def load_image(image_path, alpha=False): try: with Image.open(image_path) as image: if alpha: diff --git a/library/utils.py b/library/utils.py index a219f6cb7..5b7e657b2 100644 --- a/library/utils.py +++ b/library/utils.py @@ -11,6 +11,7 @@ from PIL import Image import numpy as np + def fire_in_thread(f, *args, **kwargs): threading.Thread(target=f, args=args, kwargs=kwargs).start() @@ -80,8 +81,8 @@ def setup_logging(args=None, log_level=None, reset=False): logger = logging.getLogger(__name__) logger.info(msg_init) -def pil_resize(image, size, interpolation=Image.LANCZOS): +def pil_resize(image, size, interpolation=Image.LANCZOS): pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # use Pillow resize @@ -92,6 +93,7 @@ def pil_resize(image, size, interpolation=Image.LANCZOS): return resized_cv2 + # TODO make inf_utils.py From fd68703f3795b3e9c75409ac5452807d056b928f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9D=92=E9=BE=8D=E8=81=96=E8=80=85=40bdsqlsz?= <865105819@qq.com> Date: Wed, 11 Sep 2024 20:25:45 +0800 Subject: [PATCH 09/21] Add New lr scheduler (#1393) * add new lr scheduler * fix bugs and use num_cycles / 2 * Update requirements.txt * add num_cycles for min lr * keep PIECEWISE_CONSTANT * allow use float with warmup or decay ratio. 
* Update train_util.py --- library/train_util.py | 80 ++++++++++++++++++++++++++++++++++++++----- requirements.txt | 6 ++-- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index c7b73ee37..340f6d640 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -42,7 +42,8 @@ from torchvision import transforms from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection import transformers -from diffusers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION +from diffusers.optimization import SchedulerType as DiffusersSchedulerType, TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION +from transformers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION from diffusers import ( StableDiffusionPipeline, DDPMScheduler, @@ -2972,6 +2973,20 @@ def add_sd_models_arguments(parser: argparse.ArgumentParser): def add_optimizer_arguments(parser: argparse.ArgumentParser): + def int_or_float(value): + if value.endswith('%'): + try: + return float(value[:-1]) / 100.0 + except ValueError: + raise argparse.ArgumentTypeError(f"Value '{value}' is not a valid percentage") + try: + float_value = float(value) + if float_value >= 1: + return int(value) + return float(value) + except ValueError: + raise argparse.ArgumentTypeError(f"'{value}' is not an int or float") + parser.add_argument( "--optimizer_type", type=str, @@ -3024,9 +3039,15 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): ) parser.add_argument( "--lr_warmup_steps", - type=int, + type=int_or_float, + default=0, + help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)", + ) + parser.add_argument( + "--lr_decay_steps", + type=int_or_float, default=0, - help="Number of steps for the warmup in the lr scheduler (default is 0) / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)", + help="Int number of steps for the decay in the lr scheduler (default is 0) or float with ratio of train steps", ) parser.add_argument( "--lr_scheduler_num_cycles", @@ -3046,6 +3067,18 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): help="Combines backward pass and optimizer step to reduce VRAM usage. Only available in SDXL" + " / バックワードパスとオプティマイザステップを組み合わせてVRAMの使用量を削減します。SDXLでのみ有効", ) + parser.add_argument( + "--lr_scheduler_timescale", + type=int, + default=None, + help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`", + ) + parser.add_argument( + "--lr_scheduler_min_lr_ratio", + type=float, + default=None, + help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler", + ) def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: bool): @@ -4293,10 +4326,14 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int): Unified API to get any scheduler from its name. 
""" name = args.lr_scheduler - num_warmup_steps: Optional[int] = args.lr_warmup_steps num_training_steps = args.max_train_steps * num_processes # * args.gradient_accumulation_steps + num_warmup_steps: Optional[int] = int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps + num_decay_steps: Optional[int] = int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps + num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps num_cycles = args.lr_scheduler_num_cycles power = args.lr_scheduler_power + timescale = args.lr_scheduler_timescale + min_lr_ratio = args.lr_scheduler_min_lr_ratio lr_scheduler_kwargs = {} # get custom lr_scheduler kwargs if args.lr_scheduler_args is not None and len(args.lr_scheduler_args) > 0: @@ -4332,13 +4369,13 @@ def wrap_check_needless_num_warmup_steps(return_vals): # logger.info(f"adafactor scheduler init lr {initial_lr}") return wrap_check_needless_num_warmup_steps(transformers.optimization.AdafactorSchedule(optimizer, initial_lr)) - name = SchedulerType(name) - schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] + name = SchedulerType(name) or DiffusersSchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] or DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name] if name == SchedulerType.CONSTANT: return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs)) - if name == SchedulerType.PIECEWISE_CONSTANT: + if name == DiffusersSchedulerType.PIECEWISE_CONSTANT: return schedule_func(optimizer, **lr_scheduler_kwargs) # step_rules and last_epoch are given as kwargs # All other schedulers require `num_warmup_steps` @@ -4348,6 +4385,9 @@ def wrap_check_needless_num_warmup_steps(return_vals): if name == SchedulerType.CONSTANT_WITH_WARMUP: return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **lr_scheduler_kwargs) + if name == SchedulerType.INVERSE_SQRT: + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, timescale=timescale, **lr_scheduler_kwargs) + # All other schedulers require `num_training_steps` if num_training_steps is None: raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") @@ -4366,7 +4406,31 @@ def wrap_check_needless_num_warmup_steps(return_vals): optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power, **lr_scheduler_kwargs ) - return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, **lr_scheduler_kwargs) + if name == SchedulerType.COSINE_WITH_MIN_LR: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles / 2, + min_lr_rate=min_lr_ratio, + **lr_scheduler_kwargs, + ) + + # All other schedulers require `num_decay_steps` + if num_decay_steps is None: + raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.") + if name == SchedulerType.WARMUP_STABLE_DECAY: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_stable_steps=num_stable_steps, + num_decay_steps=num_decay_steps, + num_cycles=num_cycles / 2, + min_lr_ratio=min_lr_ratio if min_lr_ratio is not None else 0.0, + **lr_scheduler_kwargs, + ) + + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_decay_steps=num_decay_steps, **lr_scheduler_kwargs) def prepare_dataset_args(args: argparse.Namespace, 
support_metadata: bool): diff --git a/requirements.txt b/requirements.txt index 977c5cd91..d2a2fbb8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -accelerate==0.25.0 -transformers==4.36.2 +accelerate==0.30.0 +transformers==4.41.2 diffusers[torch]==0.25.0 ftfy==6.1.1 # albumentations==1.3.0 @@ -16,7 +16,7 @@ altair==4.2.2 easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 -huggingface-hub==0.20.1 +huggingface-hub==0.23.3 # for Image utils imagesize==1.4.1 # for BLIP captioning From 6dbfd47a59cdb91be2077e1d0dec0f94698348dd Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 11 Sep 2024 21:44:36 +0900 Subject: [PATCH 10/21] Fix PIECEWISE_CONSTANT to work, update requirements.txt and README #1393 --- README.md | 9 ++++++ library/train_util.py | 66 ++++++++++++++++++++++--------------- requirements.txt | 4 +-- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 16ab80e7a..011141bf1 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,15 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ### Working in progress +- __important__ The dependent libraries are updated. Please see [Upgrade](#upgrade) and update the libraries. - transformers, accelerate and huggingface_hub are updated. - If you encounter any issues, please report them. - The INVERSE_SQRT, COSINE_WITH_MIN_LR, and WARMUP_STABLE_DECAY learning rate schedules are now available in the transformers library. See PR [#1393](https://github.com/kohya-ss/sd-scripts/pull/1393) for details. Thanks to sdbds! - See the [transformers documentation](https://huggingface.co/docs/transformers/v4.44.2/en/main_classes/optimizer_schedules#schedules) for details on each scheduler. - `--lr_warmup_steps` and `--lr_decay_steps` can now be specified as a ratio of the number of training steps, not just the step value. Example: `--lr_warmup_steps=0.1` or `--lr_warmup_steps=10%`, etc. - When enlarging images in the script (when the size of the training image is small and bucket_no_upscale is not specified), it has been changed to use Pillow's resize and LANCZOS interpolation instead of OpenCV2's resize and Lanczos4 interpolation. The quality of the image enlargement may be slightly improved. PR [#1426](https://github.com/kohya-ss/sd-scripts/pull/1426) Thanks to sdbds! - Sample image generation during training now works on non-CUDA devices. PR [#1433](https://github.com/kohya-ss/sd-scripts/pull/1433) Thanks to millie-v!
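Note: the ratio form of --lr_warmup_steps and --lr_decay_steps described above works by deferring the conversion to absolute steps until the total training step count is known. A standalone sketch of that conversion, mirroring the int_or_float parsing added in PATCH 09 (the function name resolve_steps is illustrative only, not from the repository):

    def resolve_steps(value, num_training_steps):
        # int -> absolute step count; float (parsed from "0.1" or "10%") -> ratio of total steps
        return int(value * num_training_steps) if isinstance(value, float) else value

    assert resolve_steps(0.1, 10000) == 1000  # 10% ratio -> 1000 warmup steps
    assert resolve_steps(500, 10000) == 500   # a plain step count passes through unchanged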
diff --git a/library/train_util.py b/library/train_util.py index 340f6d640..e65760bae 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -42,7 +42,10 @@ from torchvision import transforms from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection import transformers -from diffusers.optimization import SchedulerType as DiffusersSchedulerType, TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION +from diffusers.optimization import ( + SchedulerType as DiffusersSchedulerType, + TYPE_TO_SCHEDULER_FUNCTION as DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION, +) from transformers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION from diffusers import ( StableDiffusionPipeline, @@ -2974,7 +2977,7 @@ def add_sd_models_arguments(parser: argparse.ArgumentParser): def add_optimizer_arguments(parser: argparse.ArgumentParser): def int_or_float(value): - if value.endswith('%'): + if value.endswith("%"): try: return float(value[:-1]) / 100.0 except ValueError: @@ -3041,13 +3044,15 @@ def int_or_float(value): "--lr_warmup_steps", type=int_or_float, default=0, - help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)", + help="Int number of steps for the warmup in the lr scheduler (default is 0) or float with ratio of train steps" + " / 学習率のスケジューラをウォームアップするステップ数(デフォルト0)、または学習ステップの比率(1未満のfloat値の場合)", ) parser.add_argument( "--lr_decay_steps", type=int_or_float, default=0, - help="Int number of steps for the decay in the lr scheduler (default is 0) or float with ratio of train steps", + help="Int number of steps for the decay in the lr scheduler (default is 0) or float (<1) with ratio of train steps" + " / 学習率のスケジューラを減衰させるステップ数(デフォルト0)、または学習ステップの比率(1未満のfloat値の場合)", ) parser.add_argument( "--lr_scheduler_num_cycles", @@ -3071,13 +3076,16 @@ def int_or_float(value): "--lr_scheduler_timescale", type=int, default=None, - help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`", + help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`" + " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`", + , ) parser.add_argument( "--lr_scheduler_min_lr_ratio", type=float, default=None, - help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler", + help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler" + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効", ) @@ -4327,8 +4335,12 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int): """ name = args.lr_scheduler num_training_steps = args.max_train_steps * num_processes # * args.gradient_accumulation_steps - num_warmup_steps: Optional[int] = int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps - num_decay_steps: Optional[int] = int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps + num_warmup_steps: Optional[int] = ( + int(args.lr_warmup_steps * num_training_steps) if isinstance(args.lr_warmup_steps, float) else args.lr_warmup_steps + ) + num_decay_steps: Optional[int] = ( + int(args.lr_decay_steps * num_training_steps) if isinstance(args.lr_decay_steps, float) else args.lr_decay_steps + ) num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps 
num_cycles = args.lr_scheduler_num_cycles power = args.lr_scheduler_power @@ -4369,15 +4381,17 @@ def wrap_check_needless_num_warmup_steps(return_vals): # logger.info(f"adafactor scheduler init lr {initial_lr}") return wrap_check_needless_num_warmup_steps(transformers.optimization.AdafactorSchedule(optimizer, initial_lr)) - name = SchedulerType(name) or DiffusersSchedulerType(name) - schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] or DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name] + if name == DiffusersSchedulerType.PIECEWISE_CONSTANT.value: + name = DiffusersSchedulerType(name) + schedule_func = DIFFUSERS_TYPE_TO_SCHEDULER_FUNCTION[name] + return schedule_func(optimizer, **lr_scheduler_kwargs) # step_rules and last_epoch are given as kwargs + + name = SchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] if name == SchedulerType.CONSTANT: return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs)) - if name == DiffusersSchedulerType.PIECEWISE_CONSTANT: - return schedule_func(optimizer, **lr_scheduler_kwargs) # step_rules and last_epoch are given as kwargs - # All other schedulers require `num_warmup_steps` if num_warmup_steps is None: raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") @@ -4408,11 +4422,11 @@ def wrap_check_needless_num_warmup_steps(return_vals): if name == SchedulerType.COSINE_WITH_MIN_LR: return schedule_func( - optimizer, - num_warmup_steps=num_warmup_steps, - num_training_steps=num_training_steps, + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, num_cycles=num_cycles / 2, - min_lr_rate=min_lr_ratio, + min_lr_rate=min_lr_ratio, **lr_scheduler_kwargs, ) @@ -4421,16 +4435,22 @@ def wrap_check_needless_num_warmup_steps(return_vals): raise ValueError(f"{name} requires `num_decay_steps`, please provide that argument.") if name == SchedulerType.WARMUP_STABLE_DECAY: return schedule_func( - optimizer, - num_warmup_steps=num_warmup_steps, - num_stable_steps=num_stable_steps, - num_decay_steps=num_decay_steps, - num_cycles=num_cycles / 2, + optimizer, + num_warmup_steps=num_warmup_steps, + num_stable_steps=num_stable_steps, + num_decay_steps=num_decay_steps, + num_cycles=num_cycles / 2, min_lr_ratio=min_lr_ratio if min_lr_ratio is not None else 0.0, **lr_scheduler_kwargs, ) - return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_decay_steps=num_decay_steps, **lr_scheduler_kwargs) + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_decay_steps=num_decay_steps, + **lr_scheduler_kwargs, + ) def prepare_dataset_args(args: argparse.Namespace, support_metadata: bool): diff --git a/requirements.txt b/requirements.txt index d2a2fbb8a..15e6e58f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ accelerate==0.30.0 -transformers==4.41.2 +transformers==4.44.0 diffusers[torch]==0.25.0 ftfy==6.1.1 # albumentations==1.3.0 @@ -16,7 +16,7 @@ altair==4.2.2 easygui==0.98.3 toml==0.10.2 voluptuous==0.13.1 -huggingface-hub==0.23.3 +huggingface-hub==0.24.5 # for Image utils imagesize==1.4.1 # for BLIP captioning From c7c666b1829a7c1f3435558efa425b08b50fab41 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Wed, 11 Sep 2024 22:12:31 +0900 Subject: [PATCH 11/21] fix typo --- library/train_util.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index e65760bae..a46d94877 100644 
--- a/library/train_util.py +++ b/library/train_util.py @@ -3077,15 +3077,14 @@ def int_or_float(value): type=int, default=None, help="Inverse sqrt timescale for inverse sqrt scheduler,defaults to `num_warmup_steps`" - " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`", - , + + " / 逆平方根スケジューラのタイムスケール、デフォルトは`num_warmup_steps`", ) parser.add_argument( "--lr_scheduler_min_lr_ratio", type=float, default=None, help="The minimum learning rate as a ratio of the initial learning rate for cosine with min lr scheduler and warmup decay scheduler" - " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効", + + " / 初期学習率の比率としての最小学習率を指定する、cosine with min lr と warmup decay スケジューラ で有効", ) From 1d7118a62268f12ebfd81c10db53bd85ef9d7631 Mon Sep 17 00:00:00 2001 From: Maru-mee <151493593+Maru-mee@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:01:36 +0900 Subject: [PATCH 12/21] Support : OFT merge to base model (#1580) * Support : OFT merge to base model * Fix typo * Fix typo_2 * Delete unused parameter 'eye' --- networks/sdxl_merge_lora.py | 192 +++++++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 48 deletions(-) diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index 3383a80de..2c998c8cb 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -8,10 +8,12 @@ from library import sai_model_spec, sdxl_model_util, train_util import library.model_util as model_util import lora +import oft from library.utils import setup_logging setup_logging() import logging logger = logging.getLogger(__name__) +import concurrent.futures def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": @@ -39,82 +41,176 @@ def save_to_file(file_name, model, state_dict, dtype, metadata): else: torch.save(model, file_name) +def detect_method_from_training_model(models, dtype): + for model in models: + lora_sd, _ = load_state_dict(model, dtype) + for key in tqdm(lora_sd.keys()): + if 'lora_up' in key or 'lora_down' in key: + return 'LoRA' + elif "oft_blocks" in key: + return 'OFT' def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_dtype): text_encoder1.to(merge_dtype) text_encoder1.to(merge_dtype) unet.to(merge_dtype) + + # detect the method: OFT or LoRA_module + method = detect_method_from_training_model(models, merge_dtype) + logger.info(f"method:{method}") # create module map name_to_module = {} for i, root_module in enumerate([text_encoder1, text_encoder2, unet]): - if i <= 1: - if i == 0: - prefix = lora.LoRANetwork.LORA_PREFIX_TEXT_ENCODER1 + if method == 'LoRA': + if i <= 1: + if i == 0: + prefix = lora.LoRANetwork.LORA_PREFIX_TEXT_ENCODER1 + else: + prefix = lora.LoRANetwork.LORA_PREFIX_TEXT_ENCODER2 + target_replace_modules = lora.LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE else: - prefix = lora.LoRANetwork.LORA_PREFIX_TEXT_ENCODER2 - target_replace_modules = lora.LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE - else: - prefix = lora.LoRANetwork.LORA_PREFIX_UNET - target_replace_modules = ( + prefix = lora.LoRANetwork.LORA_PREFIX_UNET + target_replace_modules = ( lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE + lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 + ) + elif method == 'OFT': + prefix = oft.OFTNetwork.OFT_PREFIX_UNET + target_replace_modules = ( + oft.OFTNetwork.UNET_TARGET_REPLACE_MODULE_ALL_LINEAR + oft.OFTNetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 ) for name, module in root_module.named_modules(): if module.__class__.__name__ in target_replace_modules: for child_name, 
child_module in module.named_modules(): - if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": - lora_name = prefix + "." + name + "." + child_name - lora_name = lora_name.replace(".", "_") - name_to_module[lora_name] = child_module - + if method == 'LoRA': + if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": + lora_name = prefix + "." + name + "." + child_name + lora_name = lora_name.replace(".", "_") + name_to_module[lora_name] = child_module + elif method == 'OFT': + if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": + oft_name = prefix + "." + name + "." + child_name + oft_name = oft_name.replace(".", "_") + name_to_module[oft_name] = child_module + + for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") lora_sd, _ = load_state_dict(model, merge_dtype) logger.info(f"merging...") - for key in tqdm(lora_sd.keys()): - if "lora_down" in key: - up_key = key.replace("lora_down", "lora_up") - alpha_key = key[: key.index("lora_down")] + "alpha" - # find original module for this lora - module_name = ".".join(key.split(".")[:-2]) # remove trailing ".lora_down.weight" + if method == 'LoRA': + for key in tqdm(lora_sd.keys()): + if "lora_down" in key: + up_key = key.replace("lora_down", "lora_up") + alpha_key = key[: key.index("lora_down")] + "alpha" + + # find original module for this lora + module_name = ".".join(key.split(".")[:-2]) # remove trailing ".lora_down.weight" + if module_name not in name_to_module: + logger.info(f"no module found for LoRA weight: {key}") + continue + module = name_to_module[module_name] + # logger.info(f"apply {key} to {module}") + + down_weight = lora_sd[key] + up_weight = lora_sd[up_key] + + dim = down_weight.size()[0] + alpha = lora_sd.get(alpha_key, dim) + scale = alpha / dim + + # W <- W + U * D + weight = module.weight + # logger.info(module_name, down_weight.size(), up_weight.size()) + if len(weight.size()) == 2: + # linear + weight = weight + ratio * (up_weight @ down_weight) * scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + weight + + ratio + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * scale + ) + else: + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + # logger.info(conved.size(), weight.size(), module.stride, module.padding) + weight = weight + ratio * conved * scale + + module.weight = torch.nn.Parameter(weight) + + + elif method == 'OFT': + + multiplier=1.0 + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + for key in tqdm(lora_sd.keys()): + if "oft_blocks" in key: + oft_blocks = lora_sd[key] + dim = oft_blocks.shape[0] + break + for key in tqdm(lora_sd.keys()): + if "alpha" in key: + oft_blocks = lora_sd[key] + alpha = oft_blocks.item() + break + + def merge_to(key): + if "alpha" in key: + return + + # find original module for this OFT + module_name = ".".join(key.split(".")[:-1]) if module_name not in name_to_module: - logger.info(f"no module found for LoRA weight: {key}") - continue + return module = name_to_module[module_name] - # logger.info(f"apply {key} to {module}") - down_weight = lora_sd[key] - up_weight = lora_sd[up_key] - - dim = down_weight.size()[0] - alpha = lora_sd.get(alpha_key, dim) - scale = alpha / dim - - # W <- W + U * D - weight = module.weight - # logger.info(module_name, down_weight.size(), 
up_weight.size()) - if len(weight.size()) == 2: - # linear - weight = weight + ratio * (up_weight @ down_weight) * scale - elif down_weight.size()[2:4] == (1, 1): - # conv2d 1x1 - weight = ( - weight - + ratio - * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) - * scale - ) + # logger.info(f"apply {key} to {module}") + + oft_blocks = lora_sd[key] + + if isinstance(module, torch.nn.Linear): + out_dim = module.out_features + elif isinstance(module, torch.nn.Conv2d): + out_dim = module.out_channels + + num_blocks = dim + block_size = out_dim // dim + constraint = (0 if alpha is None else alpha) * out_dim + + block_Q = oft_blocks - oft_blocks.transpose(1, 2) + norm_Q = torch.norm(block_Q.flatten()) + new_norm_Q = torch.clamp(norm_Q, max=constraint) + block_Q = block_Q * ((new_norm_Q + 1e-8) / (norm_Q + 1e-8)) + I = torch.eye(block_size, device=oft_blocks.device).unsqueeze(0).repeat(num_blocks, 1, 1) + block_R = torch.matmul(I + block_Q, (I - block_Q).inverse()) + block_R_weighted = multiplier * block_R + (1 - multiplier) * I + R = torch.block_diag(*block_R_weighted) + + # get org weight + org_sd = module.state_dict() + org_weight = org_sd["weight"].to(device) + + R = R.to(org_weight.device, dtype=org_weight.dtype) + + if org_weight.dim() == 4: + weight = torch.einsum("oihw, op -> pihw", org_weight, R) else: - # conv2d 3x3 - conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) - # logger.info(conved.size(), weight.size(), module.stride, module.padding) - weight = weight + ratio * conved * scale - + weight = torch.einsum("oi, op -> pi", org_weight, R) + + weight = weight.contiguous() # Make Tensor contiguous; required due to ThreadPoolExecutor + module.weight = torch.nn.Parameter(weight) + with concurrent.futures.ThreadPoolExecutor() as executor: + list(tqdm(executor.map(merge_to, lora_sd.keys()), total=len(lora_sd.keys()))) + def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): base_alphas = {} # alpha for merged model From 57ae44eb6138fe4a3864fffa62090f9d0113417d Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 13 Sep 2024 19:45:00 +0900 Subject: [PATCH 13/21] refactor to make safer --- networks/sdxl_merge_lora.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index 2c998c8cb..d5a54e02a 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -44,11 +44,11 @@ def save_to_file(file_name, model, state_dict, dtype, metadata): def detect_method_from_training_model(models, dtype): for model in models: lora_sd, _ = load_state_dict(model, dtype) - for key in tqdm(lora_sd.keys()): - if 'lora_up' in key or 'lora_down' in key: - return 'LoRA' - elif "oft_blocks" in key: - return 'OFT' + for key in tqdm(lora_sd.keys()): + if 'lora_up' in key or 'lora_down' in key: + return 'LoRA' + elif "oft_blocks" in key: + return 'OFT' def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_dtype): text_encoder1.to(merge_dtype) @@ -76,6 +76,7 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ ) elif method == 'OFT': prefix = oft.OFTNetwork.OFT_PREFIX_UNET + # ALL_LINEAR includes ATTN_ONLY, so we don't need to specify ATTN_ONLY target_replace_modules = ( oft.OFTNetwork.UNET_TARGET_REPLACE_MODULE_ALL_LINEAR + oft.OFTNetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 ) @@ -83,17 +84,11 @@ def 
merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ for name, module in root_module.named_modules(): if module.__class__.__name__ in target_replace_modules: for child_name, child_module in module.named_modules(): - if method == 'LoRA': - if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": - lora_name = prefix + "." + name + "." + child_name - lora_name = lora_name.replace(".", "_") - name_to_module[lora_name] = child_module - elif method == 'OFT': - if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": - oft_name = prefix + "." + name + "." + child_name - oft_name = oft_name.replace(".", "_") - name_to_module[oft_name] = child_module - + if child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "Conv2d": + lora_name = prefix + "." + name + "." + child_name + lora_name = lora_name.replace(".", "_") + name_to_module[lora_name] = child_module + for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") @@ -168,6 +163,7 @@ def merge_to(key): # find original module for this OFT module_name = ".".join(key.split(".")[:-1]) if module_name not in name_to_module: + logger.info(f"no module found for OFT weight: {key}") return module = name_to_module[module_name] @@ -208,7 +204,9 @@ def merge_to(key): module.weight = torch.nn.Parameter(weight) - with concurrent.futures.ThreadPoolExecutor() as executor: + # TODO multi-threading may cause OOM on CPU if cpu_count is too high and RAM is not enough + max_workers = 1 if device.type != "cpu" else None # avoid OOM on GPU + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: list(tqdm(executor.map(merge_to, lora_sd.keys()), total=len(lora_sd.keys()))) From 3387dc7306087b84646666e49323980c89d14945 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 13 Sep 2024 19:45:42 +0900 Subject: [PATCH 14/21] formatting, update README --- README.md | 6 +++ networks/sdxl_merge_lora.py | 86 +++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index fd81a781f..d5d2a7f73 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ## Change History +### Sep 13, 2024 / 2024-09-13: + +- `sdxl_merge_lora.py` now supports OFT. Thanks to Maru-mee for the PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580). Will be included in the next release. + +- `sdxl_merge_lora.py` が OFT をサポートしました。PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580) Maru-mee 氏に感謝します。次のリリースに含まれます。 + ### Jun 23, 2024 / 2024-06-23: - Fixed `cache_latents.py` and `cache_text_encoder_outputs.py` not working. (Will be included in the next release.) 
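Note: the core of the OFT merge refactored above is the Cayley transform: each trained block is made skew-symmetric (block_Q), then mapped to an orthogonal rotation (block_R) that is applied to the base weight. A minimal numeric sketch of that math, with a small random matrix standing in for a trained oft_blocks entry:

    import torch

    block_size = 4
    Q = 0.1 * torch.randn(block_size, block_size)
    Q = Q - Q.T                                   # skew-symmetric, like block_Q in the patch
    I = torch.eye(block_size)
    R = (I + Q) @ torch.inverse(I - Q)            # Cayley transform yields an orthogonal matrix
    assert torch.allclose(R @ R.T, I, atol=1e-5)  # R is orthogonal, so applying it preserves norms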
diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index d5a54e02a..d5b6f7f34 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -10,11 +10,14 @@ import lora import oft from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) import concurrent.futures + def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": sd = load_file(file_name) @@ -41,20 +44,22 @@ def save_to_file(file_name, model, state_dict, dtype, metadata): else: torch.save(model, file_name) + def detect_method_from_training_model(models, dtype): for model in models: lora_sd, _ = load_state_dict(model, dtype) for key in tqdm(lora_sd.keys()): - if 'lora_up' in key or 'lora_down' in key: - return 'LoRA' + if "lora_up" in key or "lora_down" in key: + return "LoRA" elif "oft_blocks" in key: - return 'OFT' + return "OFT" + def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_dtype): text_encoder1.to(merge_dtype) text_encoder1.to(merge_dtype) unet.to(merge_dtype) - + # detect the method: OFT or LoRA_module method = detect_method_from_training_model(models, merge_dtype) logger.info(f"method:{method}") @@ -62,7 +67,7 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ # create module map name_to_module = {} for i, root_module in enumerate([text_encoder1, text_encoder2, unet]): - if method == 'LoRA': + if method == "LoRA": if i <= 1: if i == 0: prefix = lora.LoRANetwork.LORA_PREFIX_TEXT_ENCODER1 @@ -72,9 +77,9 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ else: prefix = lora.LoRANetwork.LORA_PREFIX_UNET target_replace_modules = ( - lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE + lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 + lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE + lora.LoRANetwork.UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 ) - elif method == 'OFT': + elif method == "OFT": prefix = oft.OFTNetwork.OFT_PREFIX_UNET # ALL_LINEAR includes ATTN_ONLY, so we don't need to specify ATTN_ONLY target_replace_modules = ( @@ -88,15 +93,14 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ lora_name = prefix + "." + name + "." 
+ child_name lora_name = lora_name.replace(".", "_") name_to_module[lora_name] = child_module - - + for model, ratio in zip(models, ratios): logger.info(f"loading: {model}") lora_sd, _ = load_state_dict(model, merge_dtype) logger.info(f"merging...") - if method == 'LoRA': + if method == "LoRA": for key in tqdm(lora_sd.keys()): if "lora_down" in key: up_key = key.replace("lora_down", "lora_up") @@ -139,12 +143,11 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ module.weight = torch.nn.Parameter(weight) - - elif method == 'OFT': - - multiplier=1.0 - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - + elif method == "OFT": + + multiplier = 1.0 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + for key in tqdm(lora_sd.keys()): if "oft_blocks" in key: oft_blocks = lora_sd[key] @@ -154,12 +157,12 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ if "alpha" in key: oft_blocks = lora_sd[key] alpha = oft_blocks.item() - break - + break + def merge_to(key): if "alpha" in key: return - + # find original module for this OFT module_name = ".".join(key.split(".")[:-1]) if module_name not in name_to_module: @@ -168,18 +171,18 @@ def merge_to(key): module = name_to_module[module_name] # logger.info(f"apply {key} to {module}") - + oft_blocks = lora_sd[key] - + if isinstance(module, torch.nn.Linear): out_dim = module.out_features elif isinstance(module, torch.nn.Conv2d): out_dim = module.out_channels - + num_blocks = dim block_size = out_dim // dim constraint = (0 if alpha is None else alpha) * out_dim - + block_Q = oft_blocks - oft_blocks.transpose(1, 2) norm_Q = torch.norm(block_Q.flatten()) new_norm_Q = torch.clamp(norm_Q, max=constraint) @@ -188,24 +191,24 @@ def merge_to(key): block_R = torch.matmul(I + block_Q, (I - block_Q).inverse()) block_R_weighted = multiplier * block_R + (1 - multiplier) * I R = torch.block_diag(*block_R_weighted) - + # get org weight org_sd = module.state_dict() org_weight = org_sd["weight"].to(device) R = R.to(org_weight.device, dtype=org_weight.dtype) - + if org_weight.dim() == 4: weight = torch.einsum("oihw, op -> pihw", org_weight, R) else: weight = torch.einsum("oi, op -> pi", org_weight, R) - - weight = weight.contiguous() # Make Tensor contiguous; required due to ThreadPoolExecutor - + + weight = weight.contiguous() # Make Tensor contiguous; required due to ThreadPoolExecutor + module.weight = torch.nn.Parameter(weight) # TODO multi-threading may cause OOM on CPU if cpu_count is too high and RAM is not enough - max_workers = 1 if device.type != "cpu" else None # avoid OOM on GPU + max_workers = 1 if device.type != "cpu" else None # avoid OOM on GPU with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: list(tqdm(executor.map(merge_to, lora_sd.keys()), total=len(lora_sd.keys()))) @@ -258,7 +261,7 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): for key in tqdm(lora_sd.keys()): if "alpha" in key: continue - + if "lora_up" in key and concat: concat_dim = 1 elif "lora_down" in key and concat: @@ -272,8 +275,8 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): alpha = alphas[lora_module_name] scale = math.sqrt(alpha / base_alpha) * ratio - scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 - + scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 + if key in merged_sd: assert ( merged_sd[key].size() == lora_sd[key].size() or concat_dim is not 
None @@ -295,7 +298,7 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): dim = merged_sd[key_down].shape[0] perm = torch.randperm(dim) merged_sd[key_down] = merged_sd[key_down][perm] - merged_sd[key_up] = merged_sd[key_up][:,perm] + merged_sd[key_up] = merged_sd[key_up][:, perm] logger.info("merged model") logger.info(f"dim: {list(set(base_dims.values()))}, alpha: {list(set(base_alphas.values()))}") @@ -323,7 +326,9 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): def merge(args): - assert len(args.models) == len(args.ratios), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + assert len(args.models) == len( + args.ratios + ), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" def str_to_dtype(p): if p == "float": @@ -410,10 +415,16 @@ def setup_parser() -> argparse.ArgumentParser: help="Stable Diffusion model to load: ckpt or safetensors file, merge LoRA models if omitted / 読み込むモデル、ckptまたはsafetensors。省略時はLoRAモデル同士をマージする", ) parser.add_argument( - "--save_to", type=str, default=None, help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors" + "--save_to", + type=str, + default=None, + help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors", ) parser.add_argument( - "--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors" + "--models", + type=str, + nargs="*", + help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors", ) parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") parser.add_argument( @@ -431,8 +442,7 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--shuffle", action="store_true", - help="shuffle lora weight./ " - + "LoRAの重みをシャッフルする", + help="shuffle lora weight./ " + "LoRAの重みをシャッフルする", ) return parser From 734d2e5b2b7a1551f3750a15e71060f3beed98e9 Mon Sep 17 00:00:00 2001 From: terracottahaniwa <57107346+terracottahaniwa@users.noreply.github.com> Date: Fri, 13 Sep 2024 20:45:35 +0900 Subject: [PATCH 15/21] Support Lora Block Weight (LBW) to svd_merge_lora.py (#1575) * support lora block weight * solve license incompatibility * Fix issue: lbw index calculation --- networks/svd_merge_lora.py | 150 ++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/networks/svd_merge_lora.py b/networks/svd_merge_lora.py index cb00a6000..6e163aecf 100644 --- a/networks/svd_merge_lora.py +++ b/networks/svd_merge_lora.py @@ -1,5 +1,8 @@ import argparse +import itertools +import json import os +import re import time import torch from safetensors.torch import load_file, save_file @@ -14,6 +17,106 @@ CLAMP_QUANTILE = 0.99 +ACCEPTABLE = [12, 17, 20, 26] +SDXL_LAYER_NUM = [12, 20] + +LAYER12 = { + "BASE": True, + "IN00": False, "IN01": False, "IN02": False, "IN03": False, "IN04": True, "IN05": True, + "IN06": False, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "MID": True, + "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, + "OUT06": False, "OUT07": False, "OUT08": False, "OUT09": False, "OUT10": False, "OUT11": False +} + +LAYER17 = { + "BASE": True, + "IN00": False, "IN01": True, "IN02": True, "IN03": False, "IN04": True, "IN05": True, + "IN06": False, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "MID": True, + 
"OUT00": False, "OUT01": False, "OUT02": False, "OUT03": True, "OUT04": True, "OUT05": True, + "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": True, "OUT10": True, "OUT11": True, +} + +LAYER20 = { + "BASE": True, + "IN00": True, "IN01": True, "IN02": True, "IN03": True, "IN04": True, "IN05": True, + "IN06": True, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "MID": True, + "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, + "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": False, "OUT10": False, "OUT11": False, +} + +LAYER26 = { + "BASE": True, + "IN00": True, "IN01": True, "IN02": True, "IN03": True, "IN04": True, "IN05": True, + "IN06": True, "IN07": True, "IN08": True, "IN09": True, "IN10": True, "IN11": True, + "MID": True, + "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, + "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": True, "OUT10": True, "OUT11": True, +} + +assert len([v for v in LAYER12.values() if v]) == 12 +assert len([v for v in LAYER17.values() if v]) == 17 +assert len([v for v in LAYER20.values() if v]) == 20 +assert len([v for v in LAYER26.values() if v]) == 26 + +RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_") + + +def get_lbw_block_index(lora_name: str, is_sdxl: bool = False) -> int: + # lbw block index is 0-based, but 0 for text encoder, so we return 0 for text encoder + if "text_model_encoder_" in lora_name: # LoRA for text encoder + return 0 + + # lbw block index is 1-based for U-Net, and no "input_blocks.0" in CompVis SD, so "input_blocks.1" have index 2 + block_idx = -1 # invalid lora name + if not is_sdxl: + NUM_OF_BLOCKS = 12 # up/down blocks + m = RE_UPDOWN.search(lora_name) + if m: + g = m.groups() + up_down = g[0] + i = int(g[1]) + j = int(g[3]) + if up_down == "down": + if g[2] == "resnets" or g[2] == "attentions": + idx = 3 * i + j + 1 + elif g[2] == "downsamplers": + idx = 3 * (i + 1) + else: + return block_idx # invalid lora name + elif up_down == "up": + if g[2] == "resnets" or g[2] == "attentions": + idx = 3 * i + j + elif g[2] == "upsamplers": + idx = 3 * i + 2 + else: + return block_idx # invalid lora name + + if g[0] == "down": + block_idx = 1 + idx # 1-based index, down block index + elif g[0] == "up": + block_idx = 1 + NUM_OF_BLOCKS + 1 + idx # 1-based index, num blocks, mid block, up block index + + elif "mid_block_" in lora_name: + block_idx = 1 + NUM_OF_BLOCKS # 1-based index, num blocks, mid block + else: + if lora_name.startswith("lora_unet_"): + name = lora_name[len("lora_unet_") :] + if name.startswith("time_embed_") or name.startswith("label_emb_"): # 1, No LoRA in sd-scripts + block_idx = 1 + elif name.startswith("input_blocks_"): # 1-8 to 2-9 + block_idx = 1 + int(name.split("_")[2]) + elif name.startswith("middle_block_"): # 10 + block_idx = 10 + elif name.startswith("output_blocks_"): # 0-8 to 11-19 + block_idx = 11 + int(name.split("_")[2]) + elif name.startswith("out_"): # 20, No LoRA in sd-scripts + block_idx = 20 + + return block_idx + def load_state_dict(file_name, dtype): if os.path.splitext(file_name)[1] == ".safetensors": @@ -42,12 +145,34 @@ def save_to_file(file_name, state_dict, dtype, metadata): torch.save(state_dict, file_name) -def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dtype): +def merge_lora_models(models, ratios, lbws, new_rank, new_conv_rank, device, merge_dtype): logger.info(f"new rank: {new_rank}, new conv 
rank: {new_conv_rank}") merged_sd = {} - v2 = None + v2 = None # This is meaning LoRA Metadata v2, Not meaning SD2 base_model = None - for model, ratio in zip(models, ratios): + + if lbws: + try: + # lbwは"[1,1,1,1,1,1,1,1,1,1,1,1]"のような文字列で与えられることを期待している + lbws = [json.loads(lbw) for lbw in lbws] + except Exception: + raise ValueError(f"format of lbws are must be json / 層別適用率はJSON形式で書いてください") + assert all(isinstance(lbw, list) for lbw in lbws), f"lbws are must be list / 層別適用率はリストにしてください" + assert len(set(len(lbw) for lbw in lbws)) == 1, "all lbws should have the same length / 層別適用率は同じ長さにしてください" + assert all(len(lbw) in ACCEPTABLE for lbw in lbws), f"length of lbw are must be in {ACCEPTABLE} / 層別適用率の長さは{ACCEPTABLE}のいずれかにしてください" + assert all(all(isinstance(weight, (int, float)) for weight in lbw) for lbw in lbws), f"values of lbs are must be numbers / 層別適用率の値はすべて数値にしてください" + + layer_num = len(lbws[0]) + is_sdxl = True if layer_num in SDXL_LAYER_NUM else False + FLAGS = { + "12": LAYER12.values(), + "17": LAYER17.values(), + "20": LAYER20.values(), + "26": LAYER26.values(), + }[str(layer_num)] + LBW_TARGET_IDX = [i for i, flag in enumerate(FLAGS) if flag] + + for model, ratio, lbw in itertools.zip_longest(models, ratios, lbws): logger.info(f"loading: {model}") lora_sd, lora_metadata = load_state_dict(model, merge_dtype) @@ -57,6 +182,12 @@ def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dty if base_model is None: base_model = lora_metadata.get(train_util.SS_METADATA_KEY_BASE_MODEL_VERSION, None) + if lbw: + lbw_weights = [1] * 26 + for index, value in zip(LBW_TARGET_IDX, lbw): + lbw_weights[index] = value + print(dict(zip(LAYER26.keys(), lbw_weights))) + # merge logger.info(f"merging...") for key in tqdm(list(lora_sd.keys())): @@ -93,6 +224,12 @@ def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dty # W <- W + U * D scale = alpha / network_dim + if lbw: + index = get_lbw_block_index(key, is_sdxl) + is_lbw_target = index in LBW_TARGET_IDX + if is_lbw_target: + scale *= lbw_weights[index] # keyがlbwの対象であれば、lbwの重みを掛ける + if device: # and isinstance(scale, torch.Tensor): scale = scale.to(device) @@ -170,6 +307,10 @@ def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dty def merge(args): assert len(args.models) == len(args.ratios), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + if args.lbws: + assert len(args.models) == len(args.lbws), f"number of models must be equal to number of ratios / モデルの数と層別適用率の数は合わせてください" + else: + args.lbws = [] # zip_longestで扱えるようにlbws未使用時には空のリストにしておく def str_to_dtype(p): if p == "float": @@ -187,7 +328,7 @@ def str_to_dtype(p): new_conv_rank = args.new_conv_rank if args.new_conv_rank is not None else args.new_rank state_dict, metadata, v2, base_model = merge_lora_models( - args.models, args.ratios, args.new_rank, new_conv_rank, args.device, merge_dtype + args.models, args.ratios, args.lbws, args.new_rank, new_conv_rank, args.device, merge_dtype ) logger.info(f"calculating hashes and creating metadata...") @@ -237,6 +378,7 @@ def setup_parser() -> argparse.ArgumentParser: "--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors" ) parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument("--lbws", type=str, nargs="*", help="lbw for each model / それぞれのLoRAモデルの層別適用率") parser.add_argument("--new_rank", type=int, default=4, help="Specify 
rank of output LoRA / 出力するLoRAのrank (dim)") parser.add_argument( "--new_conv_rank", From f4a0bea6dce152e2210f611f94acfdfaa72068fe Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 13 Sep 2024 21:26:06 +0900 Subject: [PATCH 16/21] format by black --- networks/svd_merge_lora.py | 188 +++++++++++++++++++++++++++++-------- 1 file changed, 147 insertions(+), 41 deletions(-) diff --git a/networks/svd_merge_lora.py b/networks/svd_merge_lora.py index 6e163aecf..0decd9048 100644 --- a/networks/svd_merge_lora.py +++ b/networks/svd_merge_lora.py @@ -11,8 +11,10 @@ import library.model_util as model_util import lora from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) CLAMP_QUANTILE = 0.99 @@ -22,38 +24,118 @@ LAYER12 = { "BASE": True, - "IN00": False, "IN01": False, "IN02": False, "IN03": False, "IN04": True, "IN05": True, - "IN06": False, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "IN00": False, + "IN01": False, + "IN02": False, + "IN03": False, + "IN04": True, + "IN05": True, + "IN06": False, + "IN07": True, + "IN08": True, + "IN09": False, + "IN10": False, + "IN11": False, "MID": True, - "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, - "OUT06": False, "OUT07": False, "OUT08": False, "OUT09": False, "OUT10": False, "OUT11": False + "OUT00": True, + "OUT01": True, + "OUT02": True, + "OUT03": True, + "OUT04": True, + "OUT05": True, + "OUT06": False, + "OUT07": False, + "OUT08": False, + "OUT09": False, + "OUT10": False, + "OUT11": False, } LAYER17 = { "BASE": True, - "IN00": False, "IN01": True, "IN02": True, "IN03": False, "IN04": True, "IN05": True, - "IN06": False, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "IN00": False, + "IN01": True, + "IN02": True, + "IN03": False, + "IN04": True, + "IN05": True, + "IN06": False, + "IN07": True, + "IN08": True, + "IN09": False, + "IN10": False, + "IN11": False, "MID": True, - "OUT00": False, "OUT01": False, "OUT02": False, "OUT03": True, "OUT04": True, "OUT05": True, - "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": True, "OUT10": True, "OUT11": True, + "OUT00": False, + "OUT01": False, + "OUT02": False, + "OUT03": True, + "OUT04": True, + "OUT05": True, + "OUT06": True, + "OUT07": True, + "OUT08": True, + "OUT09": True, + "OUT10": True, + "OUT11": True, } LAYER20 = { "BASE": True, - "IN00": True, "IN01": True, "IN02": True, "IN03": True, "IN04": True, "IN05": True, - "IN06": True, "IN07": True, "IN08": True, "IN09": False, "IN10": False, "IN11": False, + "IN00": True, + "IN01": True, + "IN02": True, + "IN03": True, + "IN04": True, + "IN05": True, + "IN06": True, + "IN07": True, + "IN08": True, + "IN09": False, + "IN10": False, + "IN11": False, "MID": True, - "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, - "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": False, "OUT10": False, "OUT11": False, + "OUT00": True, + "OUT01": True, + "OUT02": True, + "OUT03": True, + "OUT04": True, + "OUT05": True, + "OUT06": True, + "OUT07": True, + "OUT08": True, + "OUT09": False, + "OUT10": False, + "OUT11": False, } LAYER26 = { "BASE": True, - "IN00": True, "IN01": True, "IN02": True, "IN03": True, "IN04": True, "IN05": True, - "IN06": True, "IN07": True, "IN08": True, "IN09": True, "IN10": True, "IN11": True, + "IN00": True, + "IN01": True, + "IN02": True, + "IN03": True, + "IN04": True, + "IN05": True, + "IN06": True, + "IN07": True, + "IN08": True, + "IN09": 
True, + "IN10": True, + "IN11": True, "MID": True, - "OUT00": True, "OUT01": True, "OUT02": True, "OUT03": True, "OUT04": True, "OUT05": True, - "OUT06": True, "OUT07": True, "OUT08": True, "OUT09": True, "OUT10": True, "OUT11": True, + "OUT00": True, + "OUT01": True, + "OUT02": True, + "OUT03": True, + "OUT04": True, + "OUT05": True, + "OUT06": True, + "OUT07": True, + "OUT08": True, + "OUT09": True, + "OUT10": True, + "OUT11": True, } assert len([v for v in LAYER12.values() if v]) == 12 @@ -145,6 +227,33 @@ def save_to_file(file_name, state_dict, dtype, metadata): torch.save(state_dict, file_name) +def format_lbws(lbws): + try: + # lbwは"[1,1,1,1,1,1,1,1,1,1,1,1]"のような文字列で与えられることを期待している + lbws = [json.loads(lbw) for lbw in lbws] + except Exception: + raise ValueError(f"format of lbws are must be json / 層別適用率はJSON形式で書いてください") + assert all(isinstance(lbw, list) for lbw in lbws), f"lbws are must be list / 層別適用率はリストにしてください" + assert len(set(len(lbw) for lbw in lbws)) == 1, "all lbws should have the same length / 層別適用率は同じ長さにしてください" + assert all( + len(lbw) in ACCEPTABLE for lbw in lbws + ), f"length of lbw are must be in {ACCEPTABLE} / 層別適用率の長さは{ACCEPTABLE}のいずれかにしてください" + assert all( + all(isinstance(weight, (int, float)) for weight in lbw) for lbw in lbws + ), f"values of lbs are must be numbers / 層別適用率の値はすべて数値にしてください" + + layer_num = len(lbws[0]) + is_sdxl = True if layer_num in SDXL_LAYER_NUM else False + FLAGS = { + "12": LAYER12.values(), + "17": LAYER17.values(), + "20": LAYER20.values(), + "26": LAYER26.values(), + }[str(layer_num)] + LBW_TARGET_IDX = [i for i, flag in enumerate(FLAGS) if flag] + return lbws, is_sdxl, LBW_TARGET_IDX + + def merge_lora_models(models, ratios, lbws, new_rank, new_conv_rank, device, merge_dtype): logger.info(f"new rank: {new_rank}, new conv rank: {new_conv_rank}") merged_sd = {} @@ -152,25 +261,10 @@ def merge_lora_models(models, ratios, lbws, new_rank, new_conv_rank, device, mer base_model = None if lbws: - try: - # lbwは"[1,1,1,1,1,1,1,1,1,1,1,1]"のような文字列で与えられることを期待している - lbws = [json.loads(lbw) for lbw in lbws] - except Exception: - raise ValueError(f"format of lbws are must be json / 層別適用率はJSON形式で書いてください") - assert all(isinstance(lbw, list) for lbw in lbws), f"lbws are must be list / 層別適用率はリストにしてください" - assert len(set(len(lbw) for lbw in lbws)) == 1, "all lbws should have the same length / 層別適用率は同じ長さにしてください" - assert all(len(lbw) in ACCEPTABLE for lbw in lbws), f"length of lbw are must be in {ACCEPTABLE} / 層別適用率の長さは{ACCEPTABLE}のいずれかにしてください" - assert all(all(isinstance(weight, (int, float)) for weight in lbw) for lbw in lbws), f"values of lbs are must be numbers / 層別適用率の値はすべて数値にしてください" - - layer_num = len(lbws[0]) - is_sdxl = True if layer_num in SDXL_LAYER_NUM else False - FLAGS = { - "12": LAYER12.values(), - "17": LAYER17.values(), - "20": LAYER20.values(), - "26": LAYER26.values(), - }[str(layer_num)] - LBW_TARGET_IDX = [i for i, flag in enumerate(FLAGS) if flag] + lbws, is_sdxl, LBW_TARGET_IDX = format_lbws(lbws) + else: + is_sdxl = False + LBW_TARGET_IDX = [] for model, ratio, lbw in itertools.zip_longest(models, ratios, lbws): logger.info(f"loading: {model}") @@ -186,7 +280,7 @@ def merge_lora_models(models, ratios, lbws, new_rank, new_conv_rank, device, mer lbw_weights = [1] * 26 for index, value in zip(LBW_TARGET_IDX, lbw): lbw_weights[index] = value - print(dict(zip(LAYER26.keys(), lbw_weights))) + logger.info(f"lbw: {dict(zip(LAYER26.keys(), lbw_weights))}") # merge logger.info(f"merging...") @@ -306,9 +400,13 @@ def merge_lora_models(models, 
ratios, lbws, new_rank, new_conv_rank, device, mer def merge(args): - assert len(args.models) == len(args.ratios), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + assert len(args.models) == len( + args.ratios + ), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" if args.lbws: - assert len(args.models) == len(args.lbws), f"number of models must be equal to number of ratios / モデルの数と層別適用率の数は合わせてください" + assert len(args.models) == len( + args.lbws + ), f"number of models must be equal to number of ratios / モデルの数と層別適用率の数は合わせてください" else: args.lbws = [] # zip_longestで扱えるようにlbws未使用時には空のリストにしておく @@ -372,10 +470,16 @@ def setup_parser() -> argparse.ArgumentParser: help="precision in merging (float is recommended) / マージの計算時の精度(floatを推奨)", ) parser.add_argument( - "--save_to", type=str, default=None, help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors" + "--save_to", + type=str, + default=None, + help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors", ) parser.add_argument( - "--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors" + "--models", + type=str, + nargs="*", + help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors", ) parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") parser.add_argument("--lbws", type=str, nargs="*", help="lbw for each model / それぞれのLoRAモデルの層別適用率") @@ -386,7 +490,9 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="Specify rank of output LoRA for Conv2d 3x3, None for same as new_rank / 出力するConv2D 3x3 LoRAのrank (dim)、Noneでnew_rankと同じ", ) - parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う") + parser.add_argument( + "--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う" + ) parser.add_argument( "--no_metadata", action="store_true", From b755ebd0a4dd2967171b6b5909624325359a2aa0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 13 Sep 2024 21:29:31 +0900 Subject: [PATCH 17/21] add LBW support for SDXL merge LoRA --- README.md | 14 +++++-- networks/sdxl_merge_lora.py | 75 ++++++++++++++++++++++++++++++++----- 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d5d2a7f73..0be2f9a70 100644 --- a/README.md +++ b/README.md @@ -139,9 +139,17 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ### Sep 13, 2024 / 2024-09-13: -- `sdxl_merge_lora.py` now supports OFT. Thanks to Maru-mee for the PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580). Will be included in the next release. - -- `sdxl_merge_lora.py` が OFT をサポートしました。PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580) Maru-mee 氏に感謝します。次のリリースに含まれます。 +- `sdxl_merge_lora.py` now supports OFT. Thanks to Maru-mee for the PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580). +- `svd_merge_lora.py` now supports LBW. Thanks to terracottahaniwa. See PR [#1575](https://github.com/kohya-ss/sd-scripts/pull/1575) for details. +- `sdxl_merge_lora.py` also supports LBW. +- See [LoRA Block Weight](https://github.com/hako-mikan/sd-webui-lora-block-weight) by hako-mikan for details on LBW. +- These will be included in the next release. 
+ +- `sdxl_merge_lora.py` が OFT をサポートされました。PR [#1580](https://github.com/kohya-ss/sd-scripts/pull/1580) Maru-mee 氏に感謝します。 +- `svd_merge_lora.py` で LBW がサポートされました。PR [#1575](https://github.com/kohya-ss/sd-scripts/pull/1575) terracottahaniwa 氏に感謝します。 +- `sdxl_merge_lora.py` でも LBW がサポートされました。 +- LBW の詳細は hako-mikan 氏の [LoRA Block Weight](https://github.com/hako-mikan/sd-webui-lora-block-weight) をご覧ください。 +- 以上は次回リリースに含まれます。 ### Jun 23, 2024 / 2024-06-23: diff --git a/networks/sdxl_merge_lora.py b/networks/sdxl_merge_lora.py index d5b6f7f34..62f5a87d4 100644 --- a/networks/sdxl_merge_lora.py +++ b/networks/sdxl_merge_lora.py @@ -1,7 +1,9 @@ +import itertools import math import argparse import os import time +import concurrent.futures import torch from safetensors.torch import load_file, save_file from tqdm import tqdm @@ -9,13 +11,13 @@ import library.model_util as model_util import lora import oft +from svd_merge_lora import format_lbws, get_lbw_block_index, LAYER26 from library.utils import setup_logging setup_logging() import logging logger = logging.getLogger(__name__) -import concurrent.futures def load_state_dict(file_name, dtype): @@ -47,6 +49,7 @@ def save_to_file(file_name, model, state_dict, dtype, metadata): def detect_method_from_training_model(models, dtype): for model in models: + # TODO It is better to use key names to detect the method lora_sd, _ = load_state_dict(model, dtype) for key in tqdm(lora_sd.keys()): if "lora_up" in key or "lora_down" in key: @@ -55,15 +58,20 @@ def detect_method_from_training_model(models, dtype): return "OFT" -def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_dtype): - text_encoder1.to(merge_dtype) +def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, lbws, merge_dtype): text_encoder1.to(merge_dtype) + text_encoder2.to(merge_dtype) unet.to(merge_dtype) # detect the method: OFT or LoRA_module method = detect_method_from_training_model(models, merge_dtype) logger.info(f"method:{method}") + if lbws: + lbws, _, LBW_TARGET_IDX = format_lbws(lbws) + else: + LBW_TARGET_IDX = [] + # create module map name_to_module = {} for i, root_module in enumerate([text_encoder1, text_encoder2, unet]): @@ -94,12 +102,18 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ lora_name = lora_name.replace(".", "_") name_to_module[lora_name] = child_module - for model, ratio in zip(models, ratios): + for model, ratio, lbw in itertools.zip_longest(models, ratios, lbws): logger.info(f"loading: {model}") lora_sd, _ = load_state_dict(model, merge_dtype) logger.info(f"merging...") + if lbw: + lbw_weights = [1] * 26 + for index, value in zip(LBW_TARGET_IDX, lbw): + lbw_weights[index] = value + logger.info(f"lbw: {dict(zip(LAYER26.keys(), lbw_weights))}") + if method == "LoRA": for key in tqdm(lora_sd.keys()): if "lora_down" in key: @@ -121,6 +135,12 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ alpha = lora_sd.get(alpha_key, dim) scale = alpha / dim + if lbw: + index = get_lbw_block_index(key, True) + is_lbw_target = index in LBW_TARGET_IDX + if is_lbw_target: + scale *= lbw_weights[index] # keyがlbwの対象であれば、lbwの重みを掛ける + # W <- W + U * D weight = module.weight # logger.info(module_name, down_weight.size(), up_weight.size()) @@ -145,7 +165,6 @@ def merge_to_sd_model(text_encoder1, text_encoder2, unet, models, ratios, merge_ elif method == "OFT": - multiplier = 1.0 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") for key in tqdm(lora_sd.keys()): 
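The per-key `scale` adjustment in the LoRA branch above is the heart of the LBW support: each LoRA delta is attenuated by the weight assigned to the U-Net block its key belongs to. Below is a minimal sketch of that data flow (not repository code), reusing the helpers this patch imports from `svd_merge_lora.py`; the preset string, key, rank, and ratio are made-up illustration values, and the same-directory import is assumed to run from `networks/`, as the scripts themselves do:

```python
import torch
from svd_merge_lora import format_lbws, get_lbw_block_index

# one 20-value SDXL preset, as it would arrive via --lbws (hypothetical values)
lbws, is_sdxl, LBW_TARGET_IDX = format_lbws(["[1,1,1,1,1,0.5,1,1,1,1,1,1,1,1,1,1,1,1,1,1]"])

# expand onto the 26 named slots (BASE, IN00..IN11, MID, OUT00..OUT11)
lbw_weights = [1] * 26
for index, value in zip(LBW_TARGET_IDX, lbws[0]):
    lbw_weights[index] = value

# a made-up SDXL key; "input_blocks_4" maps to LBW slot 1 + 4 = 5 (IN04)
key = "lora_unet_input_blocks_4_1_transformer_blocks_0_attn1_to_q.lora_down.weight"
index = get_lbw_block_index(key, is_sdxl)

down_weight, up_weight = torch.randn(4, 640), torch.randn(640, 4)  # rank-4 pair
alpha, ratio = 4.0, 1.0
scale = alpha / down_weight.shape[0]
if index in LBW_TARGET_IDX:
    scale *= lbw_weights[index]  # this key's delta is halved by the 0.5 above

delta = ratio * (up_weight @ down_weight) * scale  # what gets added onto module.weight
```

The same expansion from preset list to 26-slot `lbw_weights` is exactly what `merge_to_sd_model` and `merge_lora_models` perform once per model before their merge loops.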
@@ -183,6 +202,13 @@ def merge_to(key): block_size = out_dim // dim constraint = (0 if alpha is None else alpha) * out_dim + multiplier = 1 + if lbw: + index = get_lbw_block_index(key, False) + is_lbw_target = index in LBW_TARGET_IDX + if is_lbw_target: + multiplier *= lbw_weights[index] + block_Q = oft_blocks - oft_blocks.transpose(1, 2) norm_Q = torch.norm(block_Q.flatten()) new_norm_Q = torch.clamp(norm_Q, max=constraint) @@ -213,17 +239,35 @@ def merge_to(key): list(tqdm(executor.map(merge_to, lora_sd.keys()), total=len(lora_sd.keys()))) -def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): +def merge_lora_models(models, ratios, lbws, merge_dtype, concat=False, shuffle=False): base_alphas = {} # alpha for merged model base_dims = {} + # detect the method: OFT or LoRA_module + method = detect_method_from_training_model(models, merge_dtype) + if method == "OFT": + raise ValueError( + "OFT model is not supported for merging OFT models. / OFTモデルはOFTモデル同士のマージには対応していません" + ) + + if lbws: + lbws, _, LBW_TARGET_IDX = format_lbws(lbws) + else: + LBW_TARGET_IDX = [] + merged_sd = {} v2 = None base_model = None - for model, ratio in zip(models, ratios): + for model, ratio, lbw in itertools.zip_longest(models, ratios, lbws): logger.info(f"loading: {model}") lora_sd, lora_metadata = load_state_dict(model, merge_dtype) + if lbw: + lbw_weights = [1] * 26 + for index, value in zip(LBW_TARGET_IDX, lbw): + lbw_weights[index] = value + logger.info(f"lbw: {dict(zip(LAYER26.keys(), lbw_weights))}") + if lora_metadata is not None: if v2 is None: v2 = lora_metadata.get(train_util.SS_METADATA_KEY_V2, None) # returns string, SDXLはv2がないのでFalseのはず @@ -277,6 +321,12 @@ def merge_lora_models(models, ratios, merge_dtype, concat=False, shuffle=False): scale = math.sqrt(alpha / base_alpha) * ratio scale = abs(scale) if "lora_up" in key else scale # マイナスの重みに対応する。 + if lbw: + index = get_lbw_block_index(key, True) + is_lbw_target = index in LBW_TARGET_IDX + if is_lbw_target: + scale *= lbw_weights[index] # keyがlbwの対象であれば、lbwの重みを掛ける + if key in merged_sd: assert ( merged_sd[key].size() == lora_sd[key].size() or concat_dim is not None @@ -329,6 +379,12 @@ def merge(args): assert len(args.models) == len( args.ratios ), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください" + if args.lbws: + assert len(args.models) == len( + args.lbws + ), f"number of models must be equal to number of ratios / モデルの数と層別適用率の数は合わせてください" + else: + args.lbws = [] # zip_longestで扱えるようにlbws未使用時には空のリストにしておく def str_to_dtype(p): if p == "float": @@ -356,7 +412,7 @@ def str_to_dtype(p): ckpt_info, ) = sdxl_model_util.load_models_from_sdxl_checkpoint(sdxl_model_util.MODEL_VERSION_SDXL_BASE_V1_0, args.sd_model, "cpu") - merge_to_sd_model(text_model1, text_model2, unet, args.models, args.ratios, merge_dtype) + merge_to_sd_model(text_model1, text_model2, unet, args.models, args.ratios, args.lbws, merge_dtype) if args.no_metadata: sai_metadata = None @@ -372,7 +428,7 @@ def str_to_dtype(p): args.save_to, text_model1, text_model2, unet, 0, 0, ckpt_info, vae, logit_scale, sai_metadata, save_dtype ) else: - state_dict, metadata = merge_lora_models(args.models, args.ratios, merge_dtype, args.concat, args.shuffle) + state_dict, metadata = merge_lora_models(args.models, args.ratios, args.lbws, merge_dtype, args.concat, args.shuffle) logger.info(f"calculating hashes and creating metadata...") @@ -427,6 +483,7 @@ def setup_parser() -> argparse.ArgumentParser: help="LoRA models to merge: ckpt or safetensors file / 
マージするLoRAモデル、ckptまたはsafetensors", ) parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率") + parser.add_argument("--lbws", type=str, nargs="*", help="lbw for each model / それぞれのLoRAモデルの層別適用率") parser.add_argument( "--no_metadata", action="store_true", From 93d9fbf60761fc1158e37f45f0d0c142913d70f5 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Fri, 13 Sep 2024 22:37:11 +0900 Subject: [PATCH 18/21] improve OFT implementation closes #944 --- README.md | 26 ++++++++- gen_img.py | 3 +- networks/check_lora_weights.py | 2 +- networks/oft.py | 96 +++++++++++++++++++++------------- 4 files changed, 89 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 0130ccffc..def528a22 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,31 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser - transformers, accelerate and huggingface_hub are updated. - If you encounter any issues, please report them. -- en: The INVERSE_SQRT, COSINE_WITH_MIN_LR, and WARMUP_STABLE_DECAY learning rate schedules are now available in the transformers library. See PR [#1393](https://github.com/kohya-ss/sd-scripts/pull/1393) for details. Thanks to sdbds! +- Improvements in OFT (Orthogonal Finetuning) Implementation + 1. Optimization of Calculation Order: + - Changed the calculation order in the forward method from (Wx)R to W(xR). + - This has improved computational efficiency and processing speed. + 2. Correction of Bias Application: + - In the previous implementation, R was incorrectly applied to the bias. + - The new implementation now correctly handles bias by using F.conv2d and F.linear. + 3. Efficiency Enhancement in Matrix Operations: + - Introduced einsum in both the forward and merge_to methods. + - This has optimized matrix operations, resulting in further speed improvements. + 4. Proper Handling of Data Types: + - Improved to use torch.float32 during calculations and convert results back to the original data type. + - This maintains precision while ensuring compatibility with the original model. + 5. Unified Processing for Conv2d and Linear Layers: + - Implemented a consistent method for applying OFT to both layer types. + - These changes have made the OFT implementation more efficient and accurate, potentially leading to improved model performance and training stability. + + - Additional Information + * Recommended α value for OFT constraint: We recommend using α values between 1e-4 and 1e-2. This differs slightly from the original implementation of "(α\*out_dim\*out_dim)". Our implementation uses "(α\*out_dim)", hence we recommend higher values than the 1e-5 suggested in the original implementation. + + * Performance Improvement: Training speed has been improved by approximately 30%. + + * Inference Environment: This implementation is compatible with and operates within Stable Diffusion web UI (SD1/2 and SDXL). + +- The INVERSE_SQRT, COSINE_WITH_MIN_LR, and WARMUP_STABLE_DECAY learning rate schedules are now available in the transformers library. See PR [#1393](https://github.com/kohya-ss/sd-scripts/pull/1393) for details. Thanks to sdbds! - See the [transformers documentation](https://huggingface.co/docs/transformers/v4.44.2/en/main_classes/optimizer_schedules#schedules) for details on each scheduler. - `--lr_warmup_steps` and `--lr_decay_steps` can now be specified as a ratio of the number of training steps, not just the step value. Example: `--lr_warmup_steps=0.1` or `--lr_warmup_steps=10%`, etc. 
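Before the code changes below, the claims in items 1–3 of the notes above can be sanity-checked numerically. This is a self-contained sketch, not repository code: random matrices stand in for the Cayley-parametrized `block_R`, and the batch and layer shapes are arbitrary illustrative choices:

```python
import torch

num_blocks, block_size, in_dim = 4, 8, 16
out_dim = num_blocks * block_size

W = torch.randn(out_dim, in_dim)                      # frozen Linear weight
bias = torch.randn(out_dim)
x = torch.randn(3, in_dim)
R = torch.randn(num_blocks, block_size, block_size)   # stand-in for block_R

# old order, (Wx)R: rotate the layer output, so the bias gets rotated too
y = (x @ W.T + bias).reshape(3, num_blocks, block_size)
y_old = torch.einsum("bkn,knm->bkm", y, R).reshape(3, out_dim)

# new order: fold R into W once (the patch's einsum), add the bias afterwards
RW = torch.einsum("knm,knp->kmp", R, W.reshape(num_blocks, block_size, in_dim))
y_new = x @ RW.reshape(out_dim, in_dim).T + bias

# without a bias the two orders agree exactly; the gap between y_old and
# y_new is precisely the incorrectly rotated bias that item 2 removes
y_nobias = (x @ W.T).reshape(3, num_blocks, block_size)
assert torch.allclose(
    torch.einsum("bkn,knm->bkm", y_nobias, R).reshape(3, out_dim) + bias,
    y_new,
    atol=1e-5,
)
```

Folding `R` into the weight also explains the speedup: the rotation is applied once per layer instead of once per activation tensor, and the single `F.linear`/`F.conv2d` call then handles the bias correctly.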
diff --git a/gen_img.py b/gen_img.py index d0a8f8141..59bcd5b09 100644 --- a/gen_img.py +++ b/gen_img.py @@ -86,7 +86,8 @@ """ -def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers, sdpa): +# def replace_unet_modules(unet: diffusers.models.unets.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers, sdpa): +def replace_unet_modules(unet, mem_eff_attn, xformers, sdpa): if mem_eff_attn: logger.info("Enable memory efficient attention for U-Net") diff --git a/networks/check_lora_weights.py b/networks/check_lora_weights.py index 794659c94..f8eab53ba 100644 --- a/networks/check_lora_weights.py +++ b/networks/check_lora_weights.py @@ -18,7 +18,7 @@ def main(file): keys = list(sd.keys()) for key in keys: - if "lora_up" in key or "lora_down" in key: + if "lora_up" in key or "lora_down" in key or "lora_A" in key or "lora_B" in key or "oft_" in key: values.append((key, sd[key])) print(f"number of LoRA modules: {len(values)}") diff --git a/networks/oft.py b/networks/oft.py index 461a98698..6321def3b 100644 --- a/networks/oft.py +++ b/networks/oft.py @@ -4,13 +4,17 @@ import os from typing import Dict, List, Optional, Tuple, Type, Union from diffusers import AutoencoderKL +import einops from transformers import CLIPTextModel import numpy as np import torch +import torch.nn.functional as F import re from library.utils import setup_logging + setup_logging() import logging + logger = logging.getLogger(__name__) RE_UPDOWN = re.compile(r"(up|down)_blocks_(\d+)_(resnets|upsamplers|downsamplers|attentions)_(\d+)_") @@ -45,11 +49,16 @@ def __init__( if type(alpha) == torch.Tensor: alpha = alpha.detach().numpy() - self.constraint = alpha * out_dim + + # constraint in original paper is alpha * out_dim * out_dim, but we use alpha * out_dim for backward compatibility + # original alpha is 1e-6, so we use 1e-3 or 1e-4 for alpha + self.constraint = alpha * out_dim + self.register_buffer("alpha", torch.tensor(alpha)) self.block_size = out_dim // self.num_blocks self.oft_blocks = torch.nn.Parameter(torch.zeros(self.num_blocks, self.block_size, self.block_size)) + self.I = torch.eye(self.block_size).unsqueeze(0).repeat(self.num_blocks, 1, 1) # cpu self.out_dim = out_dim self.shape = org_module.weight.shape @@ -69,27 +78,36 @@ def get_weight(self, multiplier=None): norm_Q = torch.norm(block_Q.flatten()) new_norm_Q = torch.clamp(norm_Q, max=self.constraint) block_Q = block_Q * ((new_norm_Q + 1e-8) / (norm_Q + 1e-8)) - I = torch.eye(self.block_size, device=self.oft_blocks.device).unsqueeze(0).repeat(self.num_blocks, 1, 1) - block_R = torch.matmul(I + block_Q, (I - block_Q).inverse()) - block_R_weighted = self.multiplier * block_R + (1 - self.multiplier) * I - R = torch.block_diag(*block_R_weighted) - - return R + if self.I.device != block_Q.device: + self.I = self.I.to(block_Q.device) + I = self.I + block_R = torch.matmul(I + block_Q, (I - block_Q).float().inverse()) + block_R_weighted = self.multiplier * (block_R - I) + I + return block_R_weighted def forward(self, x, scale=None): - x = self.org_forward(x) if self.multiplier == 0.0: - return x - - R = self.get_weight().to(x.device, dtype=x.dtype) - if x.dim() == 4: - x = x.permute(0, 2, 3, 1) - x = torch.matmul(x, R) - x = x.permute(0, 3, 1, 2) - else: - x = torch.matmul(x, R) - return x + return self.org_forward(x) + org_module = self.org_module[0] + org_dtype = x.dtype + + R = self.get_weight().to(torch.float32) + W = org_module.weight.to(torch.float32) + + if len(W.shape) == 4: # Conv2d + W_reshaped = 
einops.rearrange(W, "(k n) ... -> k n ...", k=self.num_blocks, n=self.block_size) + RW = torch.einsum("k n m, k n ... -> k m ...", R, W_reshaped) + RW = einops.rearrange(RW, "k m ... -> (k m) ...") + result = F.conv2d( + x, RW.to(org_dtype), org_module.bias, org_module.stride, org_module.padding, org_module.dilation, org_module.groups + ) + else: # Linear + W_reshaped = einops.rearrange(W, "(k n) m -> k n m", k=self.num_blocks, n=self.block_size) + RW = torch.einsum("k n m, k n p -> k m p", R, W_reshaped) + RW = einops.rearrange(RW, "k m p -> (k m) p") + result = F.linear(x, RW.to(org_dtype), org_module.bias) + return result class OFTInfModule(OFTModule): @@ -115,18 +133,19 @@ def forward(self, x, scale=None): return self.org_forward(x) return super().forward(x, scale) - def merge_to(self, multiplier=None, sign=1): - R = self.get_weight(multiplier) * sign - + def merge_to(self, multiplier=None): # get org weight org_sd = self.org_module[0].state_dict() - org_weight = org_sd["weight"] - R = R.to(org_weight.device, dtype=org_weight.dtype) + org_weight = org_sd["weight"].to(torch.float32) - if org_weight.dim() == 4: - weight = torch.einsum("oihw, op -> pihw", org_weight, R) - else: - weight = torch.einsum("oi, op -> pi", org_weight, R) + R = self.get_weight(multiplier).to(torch.float32) + + weight = org_weight.reshape(self.num_blocks, self.block_size, -1) + weight = torch.einsum("k n m, k n ... -> k m ...", R, weight) + weight = weight.reshape(org_weight.shape) + + # convert back to original dtype + weight = weight.to(org_sd["weight"].dtype) # set weight to org_module org_sd["weight"] = weight @@ -145,8 +164,16 @@ def create_network( ): if network_dim is None: network_dim = 4 # default - if network_alpha is None: - network_alpha = 1.0 + if network_alpha is None: # should be set + logger.info( + "network_alpha is not set, use default value 1e-3 / network_alphaが設定されていないのでデフォルト値 1e-3 を使用します" + ) + network_alpha = 1e-3 + elif network_alpha >= 1: + logger.warning( + "network_alpha is too large (>=1, maybe default value is too large), please consider to set smaller value like 1e-3" + " / network_alphaが大きすぎるようです(>=1, デフォルト値が大きすぎる可能性があります)。1e-3のような小さな値を推奨" + ) enable_all_linear = kwargs.get("enable_all_linear", None) enable_conv = kwargs.get("enable_conv", None) @@ -190,12 +217,11 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh else: if dim is None: dim = param.size()[0] - if has_conv2d is None and param.dim() == 4: + if has_conv2d is None and "in_layers_2" in name: has_conv2d = True - if all_linear is None: - if param.dim() == 3 and "attn" not in name: - all_linear = True - if dim is not None and alpha is not None and has_conv2d is not None: + if all_linear is None and "_ff_" in name: + all_linear = True + if dim is not None and alpha is not None and has_conv2d is not None and all_linear is not None: break if has_conv2d is None: has_conv2d = False @@ -241,7 +267,7 @@ def __init__( self.alpha = alpha logger.info( - f"create OFT network. num blocks: {self.dim}, constraint: {self.alpha}, multiplier: {self.multiplier}, enable_conv: {enable_conv}" + f"create OFT network. 
num blocks: {self.dim}, constraint: {self.alpha}, multiplier: {self.multiplier}, enable_conv: {enable_conv}, enable_all_linear: {enable_all_linear}" ) # create module instances From e7040669bc9a31706fe9fedec14978b05223f968 Mon Sep 17 00:00:00 2001 From: Maru-mee <151493593+Maru-mee@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:47:06 +0900 Subject: [PATCH 19/21] Bug fix: alpha_mask load --- library/train_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/train_util.py b/library/train_util.py index a46d94877..5a8da90e1 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -2207,7 +2207,7 @@ def is_disk_cached_latents_is_expected(reso, npz_path: str, flip_aug: bool, alph if alpha_mask: if "alpha_mask" not in npz: return False - if npz["alpha_mask"].shape[0:2] != reso: # HxW + if (npz["alpha_mask"].shape[1], npz["alpha_mask"].shape[0]) != reso: # HxW => WxH != reso return False else: if "alpha_mask" in npz: From 9c757c2fba43d4e91d773cf6e9b7e2e8e3e8b376 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Thu, 19 Sep 2024 21:14:57 +0900 Subject: [PATCH 20/21] fix SDXL block index to match LBW --- networks/svd_merge_lora.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/networks/svd_merge_lora.py b/networks/svd_merge_lora.py index 0decd9048..b4b9e3bfd 100644 --- a/networks/svd_merge_lora.py +++ b/networks/svd_merge_lora.py @@ -184,18 +184,19 @@ def get_lbw_block_index(lora_name: str, is_sdxl: bool = False) -> int: elif "mid_block_" in lora_name: block_idx = 1 + NUM_OF_BLOCKS # 1-based index, num blocks, mid block else: + # SDXL: some numbers are skipped if lora_name.startswith("lora_unet_"): name = lora_name[len("lora_unet_") :] if name.startswith("time_embed_") or name.startswith("label_emb_"): # 1, No LoRA in sd-scripts block_idx = 1 elif name.startswith("input_blocks_"): # 1-8 to 2-9 block_idx = 1 + int(name.split("_")[2]) - elif name.startswith("middle_block_"): # 10 - block_idx = 10 - elif name.startswith("output_blocks_"): # 0-8 to 11-19 - block_idx = 11 + int(name.split("_")[2]) - elif name.startswith("out_"): # 20, No LoRA in sd-scripts - block_idx = 20 + elif name.startswith("middle_block_"): # 13 + block_idx = 13 + elif name.startswith("output_blocks_"): # 0-8 to 14-22 + block_idx = 14 + int(name.split("_")[2]) + elif name.startswith("out_"): # 23, No LoRA in sd-scripts + block_idx = 23 return block_idx From 29177d2f0389bd13e3f12c95d463fb0e1c58f9a1 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 23 Sep 2024 21:14:03 +0900 Subject: [PATCH 21/21] retain alpha in pil_resize backport #1619 --- library/utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/library/utils.py b/library/utils.py index 5b7e657b2..49d46a546 100644 --- a/library/utils.py +++ b/library/utils.py @@ -83,13 +83,20 @@ def setup_logging(args=None, log_level=None, reset=False): def pil_resize(image, size, interpolation=Image.LANCZOS): - pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False + + if has_alpha: + pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)) + else: + pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - # use Pillow resize resized_pil = pil_image.resize(size, interpolation) - # return cv2 image - resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) + # Convert back to cv2 format + if has_alpha: + resized_cv2 = cv2.cvtColor(np.array(resized_pil), 
cv2.COLOR_RGBA2BGRA) + else: + resized_cv2 = cv2.cvtColor(np.array(resized_pil), cv2.COLOR_RGB2BGR) return resized_cv2
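To close, a short usage sketch for the patched `pil_resize`. This is a hypothetical caller, not repository code: it assumes `library/` is importable as in the training scripts and fabricates a 4-channel input to show the alpha plane surviving the round trip:

```python
import numpy as np
from library.utils import pil_resize

bgra = np.zeros((256, 384, 4), dtype=np.uint8)  # H=256, W=384, BGRA as cv2 uses
bgra[..., 3] = 255  # an alpha plane the pre-patch code would have dropped

resized = pil_resize(bgra, (192, 128))  # size is (width, height), PIL convention
assert resized.shape == (128, 192, 4)   # alpha channel is retained after resize
```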