From 142ed5dfa6bbbd5463c70a9a0adb2f1ad8da71ab Mon Sep 17 00:00:00 2001
From: sdbds <865105819@qq.com>
Date: Thu, 7 Nov 2024 22:37:33 +0800
Subject: [PATCH] Add --resolution_shift option for SD3 training timestep sampling

---
 library/sd3_train_utils.py | 21 +++++++++++++++++----
 sd3_train_network.py       |  9 +++++++--
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py
index d6c0ad1ae..be0f9bdb0 100644
--- a/library/sd3_train_utils.py
+++ b/library/sd3_train_utils.py
@@ -292,6 +292,11 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser):
         default=1.0,
         help="Discrete flow shift for training timestep distribution adjustment, applied in addition to the weighting scheme, default is 1.0. /タイムステップ分布のための離散フローシフト、重み付けスキームの上に適用される、デフォルトは1.0。",
     )
+    parser.add_argument(
+        "--resolution_shift",
+        action="store_true",
+        help="use flux resolution shift for training timestep distribution adjustment / 訓練タイムステップ分布調整のためにflux解像度シフトを使用する",
+    )
 
 
 def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCaching: bool = True):
@@ -992,7 +997,7 @@ def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
 
 
 def get_noisy_model_input_and_timesteps(args, latents, noise, device, dtype) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    bsz = latents.shape[0]
+    bsz, _, h, w = latents.shape
 
     # Sample a random timestep for each image
     # for weighting schemes where we sample timesteps non-uniformly
@@ -1005,10 +1010,18 @@ def get_noisy_model_input_and_timesteps(args, latents, noise, device, dtype) ->
     )
     t_min = args.min_timestep if args.min_timestep is not None else 0
     t_max = args.max_timestep if args.max_timestep is not None else 1000
-    shift = args.training_shift
 
-    # weighting shift, value >1 will shift distribution to noisy side (focus more on overall structure), value <1 will shift towards less-noisy side (focus more on details)
-    u = (u * shift) / (1 + (shift - 1) * u)
+    if args.resolution_shift:
+        mu = flux_train_utils.get_lin_function(
+            y1=0.5,
+            y2=1.15,
+        )((h // 2) * (w // 2))
+        u = flux_train_utils.time_shift(mu, 1.0, u)
+    else:
+        shift = args.training_shift
+
+        # weighting shift: a value >1 shifts the distribution toward the noisy side (more focus on overall structure); a value <1 shifts it toward the less-noisy side (more focus on details)
+        u = (u * shift) / (1 + (shift - 1) * u)
 
     indices = (u * (t_max - t_min) + t_min).long()
     timesteps = indices.to(device=device, dtype=dtype)
diff --git a/sd3_train_network.py b/sd3_train_network.py
index 3c9b70579..bb02c7ac7 100644
--- a/sd3_train_network.py
+++ b/sd3_train_network.py
@@ -328,8 +328,13 @@ def get_noise_pred_and_target(
             # TODO support attention mask
             model_pred = unet(noisy_model_input, timesteps, context=context, y=lg_pooled)
 
-        # apply model prediction type
-        model_pred, weighting = flux_train_utils.apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas)
+        # Precondition the model output, following Section 5 of https://arxiv.org/abs/2206.00364:
+        # model_pred = noisy_model_input - sigmas * model_pred
+        model_pred = model_pred * (-sigmas) + noisy_model_input
+
+        # these weighting schemes use a uniform timestep sampling
+        # and instead post-weight the loss
+        weighting = sd3_train_utils.compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
 
         # flow matching loss
         target = latents