# img2img.py

"""
---
title: Generate images using stable diffusion with a prompt from a given image
summary: >
 Generate images using stable diffusion with a prompt from a given image
---

# Generate images using [stable diffusion](../index.html) with a prompt from a given image
"""

import torch

import argparse
from torch import nn
import einops
from resize import resize_images_in_path

import argparse
from pathlib import Path
import clip
from labml import lab, monit
from labml_nn.diffusion.stable_diffusion.sampler.ddim import DDIMSampler
from labml_nn.diffusion.stable_diffusion.util import load_model, load_img, save_images, set_seed
# from torchvision.utils import save_image

class Img2Img:
    """
    ### Image to image class

    Loads the model once, then (on call) encodes an input image, noises it for a
    number of DDIM steps derived from `strength`, denoises it under a text prompt
    and saves the decoded results.
    """

    def __init__(self, *, config,
                 ddim_steps: int = 50,
                 ddim_eta: float = 0.0):
        r"""
        :param config: is the configuration object. `config.uvit` and `config` are passed to
            `load_model`, and `config.device` names the device to use when CUDA is available
        :param ddim_steps: is the number of sampling steps
        :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant
        """
        self.ddim_steps = ddim_steps

        # Load [latent diffusion model](../latent_diffusion.html)
        self.model = load_model(config.uvit, config)

        # Use the configured device, falling back to CPU when CUDA is unavailable
        self.device = torch.device(config.device) if torch.cuda.is_available() else torch.device("cpu")
        # Move the model to device
        self.model.to(self.device)

        # Initialize [DDIM sampler](../sampler/ddim.html)
        self.sampler = DDIMSampler(self.model,
                                   n_steps=ddim_steps,
                                   ddim_eta=ddim_eta)

    @torch.no_grad()
    def __call__(self, *,
                 dest_path: str,
                 orig_img: str,
                 strength: float,
                 batch_size: int = 3,
                 prompt: str,
                 uncond_scale: float = 5.0,
                 ):
        r"""
        Generate `batch_size` variations of `orig_img` guided by `prompt` and save them.

        :param dest_path: is the path to store the generated images
        :param orig_img: is the image to transform
        :param strength: specifies how much of the original image should not be preserved;
            must lie in $[0, 1]$
        :param batch_size: is the number of images to generate in a batch
        :param prompt: is the prompt to generate images with
        :param uncond_scale: is the unconditional guidance scale $s$. When it equals $1$ no
            empty-prompt conditioning is computed. The combination is presumably
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (1 - s)\epsilon_\text{uncond}(x_t, c_u)$
            (applied inside the sampler; confirm against `DDIMSampler.paint`)
        :raises AssertionError: if `strength` is outside $[0, 1]$
        """
        # Validate first so that bad input fails before any image loading or encoding
        assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]'

        # Make a batch of prompts
        prompts = batch_size * [prompt]

        # Load image; the result is moved to the device directly, so `load_img`
        # is expected to already return a tensor
        orig_image = load_img(orig_img).to(self.device)

        # Encode the image in the latent space and make `batch_size` copies of it
        # (a sample run showed shape [4, 4, 80, 60]; the exact shape depends on the input)
        orig = self.model.autoencoder_encode(orig_image).repeat(batch_size, 1, 1, 1).to(self.device)

        # CLIP image embedding of the original, also repeated per batch element
        orig_clipimg = self.model.get_clipimg_embedding(orig_image).repeat(batch_size, 1, 1).to(self.device)

        # Get the number of steps to diffuse the original
        t_index = int(strength * self.ddim_steps)

        # AMP auto casting
        with torch.cuda.amp.autocast():
            # If unconditional scaling is not $1$ get the embeddings for empty prompts (no conditioning).
            if uncond_scale != 1.0:
                un_cond = self.model.get_text_conditioning(batch_size * [""])
            else:
                un_cond = None

            # Get the prompt embeddings and pass them through the model's prefix encoder
            cond = self.model.get_text_conditioning(prompts)
            cond = self.model.get_encode_prefix(cond)

            # Thin callables handed to the sampler so it can convert caption embeddings
            def captiondecodeprefix(x):
                return self.model.get_decode_prefix(x)

            def captionencodeprefix(x):
                return self.model.get_encode_prefix(x)

            # Per-sample image timestep filled with the value `t_index`.
            # NOTE: `torch.Tensor(n)` would allocate an *uninitialized* tensor of
            # length `n` rather than a tensor holding `n`, so build it explicitly.
            # Shape (batch_size, 1) is assumed to be what `paint` expects; TODO confirm.
            t_img = torch.full((batch_size, 1), float(t_index), device=self.device)
            # Text timestep is zero for every sample
            t_text = torch.zeros(t_img.size(0), dtype=torch.int, device=self.device)
            # Integer tensor of ones, one entry per sample
            datatype = torch.ones_like(t_text)

            # Add noise to the original image; the second return value is not needed here
            x, _ = self.sampler.q_sample(orig, t_index)

            # Reconstruct from the noisy image
            x = self.sampler.paint(x, cond, t_index, t_img, orig_clipimg, t_text, datatype,
                                   captiondecodeprefix, captionencodeprefix,
                                   uncond_scale=uncond_scale,
                                   uncond_cond=un_cond)

            # Decode the image from the [autoencoder](../model/autoencoder.html)
            images = self.model.autoencoder_decode(x)

        # Save images
        save_images(images, dest_path, 'img_')


def main():
    """
    ### CLI

    Parse the command line options, build an `Img2Img` instance on the requested
    device and generate images from `--orig-img` guided by `--prompt`, writing the
    results to `--dest_path`.
    """
    from configs.sample_config import get_config

    config = get_config()

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--prompt",
        type=str,
        nargs="?",
        default="",
        help="the prompt to render"
    )

    parser.add_argument(
        "--orig-img",
        type=str,
        nargs="?",
        default="/home/schengwei/Competitionrepo/resources/boy1_example.jpeg",
        help="path to the input image"
    )
    parser.add_argument(
        "--device-id",
        type=str,
        default="cuda:5",
        help="device to use"
    )
    parser.add_argument("--batch_size", type=int, default=4, help="batch size", )
    parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps")

    parser.add_argument("--scale", type=float, default=5.0,
                        help="unconditional guidance scale: "
                             "eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))")

    parser.add_argument("--strength", type=float, default=0.01,
                        help="strength for noise: "
                             "vary from 0.0 to 1.0 which 1.0 corresponds to full destruction of information in init image")
    # eta is a real-valued constant, so parse it as a float; integer input such as
    # `--ddim_eta 0` still works
    parser.add_argument(
        "--ddim_eta",
        type=float,
        default=0.0,
        help="ddim eta control the noise adding each step."
    )
    parser.add_argument(
        "--dest_path",
        type=str,
        default="/home/schengwei/Competitionrepo/ddimoutput",
        help="the path to save the generated images"
    )

    opt = parser.parse_args()
    set_seed(42)

    # Select which device (GPU) to use
    config.device = opt.device_id

    img2img = Img2Img(config=config, ddim_steps=opt.steps, ddim_eta=opt.ddim_eta)

    with monit.section('Generate'):
        # Honour `--dest_path` instead of a hard-coded directory; its default is the
        # previously hard-coded location, so default behaviour is unchanged
        img2img(
            dest_path=opt.dest_path,
            orig_img=opt.orig_img,
            strength=opt.strength,
            batch_size=opt.batch_size,
            prompt=opt.prompt,
            uncond_scale=opt.scale)


# Run the CLI only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()