+# Byte-compiled / optimized / DLL files
+# C extensions
+# Distribution / packaging
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+# Installer logs
+# Unit test / coverage reports
+# Translations
+# Django stuff:
+# Flask stuff:
+# Scrapy stuff:
+# Sphinx documentation
+# PyBuilder
+# Jupyter Notebook
+# IPython
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+# Celery stuff
+# SageMath parsed files
+# Environments
+# Spyder project settings
+# Rope project settings
+# mkdocs documentation
+# mypy
+# Pyre type checker
+# pytype static type analyzer
+# Cython debug symbols
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# Do not save this folder (mainly used for AI generated content)
# ComfyUI workflows
+This repository stores workflows for [ComfyUI](https://github.com/comfyanonymous/ComfyUI).
+Please note that the custom nodes involved have to be loaded manually in ComfyUI.
+## Philosophy
+The nodes defined here are totally independent of the rest of the project and may reimplement existing
+features of the code base. The main purpose of this folder is to provide visual equivalents of the code features.
+## Main workflows
+The workflows can be found in the ``workflows`` folder. It is recommended to use small workflow steps,
+as the results of the biggest workflows are quite random.
+- sdxl.json: Basic image generation using Stable Diffusion, roughly equivalent to ``../skybox/diffusion.py``
+- sdxl_with_refiner.json: Improved image generation that implements the SDXL refiner step.
+- inpainting_demo.json: some simple demo for inpainting.
+- sdxl_inpainting_demo.json: a more complete inpainting demo using pure SDXL features.
+- central_inpainting.json: An inpainting implementation with the standard functions,
+using [sdxl inpainting](https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1),
+that creates a horizontal tiling.
+- panorama_creator.json: an extended workflow to create a flat image as a panorama.
+- text_to_skybox.json: a complete workflow to generate a skybox from a prompt.
+A custom implementation of eigenpunk/ComfyUI-audio; the main difference is that it accepts several prompts as input.
+Source : https://github.com/eigenpunk/ComfyUI-audio/blob/main/musicgen_nodes.py
+import ast
+import gc
+from contextlib import contextmanager
+from torch.nn.functional import pad
+from typing import Optional, Union
+import torch
+from audiocraft.models import AudioGen, MusicGen
+ "musicgen-small",
+ "musicgen-medium",
+ "musicgen-melody",
+ "musicgen-large",
+ "musicgen-melody-large",
+ # TODO: stereo models seem not to be working out of the box
+ # "musicgen-stereo-small",
+ # "musicgen-stereo-medium",
+ # "musicgen-stereo-melody",
+ # "musicgen-stereo-large",
+ # "musicgen-stereo-melody-large",
+ "audiogen-medium",
def do_cleanup(cuda_cache=True):
    """
    run the garbage collector and, unless cuda_cache is falsy, also ask pytorch to release
    the CUDA memory it keeps cached.
    """
    gc.collect()
    if not cuda_cache:
        return
    torch.cuda.empty_cache()
def object_to(obj, device=None, exclude=None, empty_cuda_cache=True, verbose=False):
    """
    recurse through an object and move any pytorch tensors/parameters/modules to the given device.

    if device is None, cpu is used by default. if the device is a CUDA device and empty_cuda_cache is
    enabled, this will also free unused CUDA memory cached by pytorch.

    objects without a `__dict__` are returned untouched. attributes are visited through `vars()`;
    `exclude` is a collection of dotted attribute paths (e.g. "lm.cache") that must be skipped.
    `device` may be a string or a `torch.device`. each nested object is walked at most once, so
    objects that reference each other (parent <-> child) do not cause infinite recursion.

    returns the same object (or, for an nn.Module, whatever `obj.to(device)` returns).
    """
    if not hasattr(obj, "__dict__"):
        return obj

    classname = type(obj).__name__
    exclude = exclude or set()
    device = device or "cpu"
    # ids of the objects already walked; guards against reference cycles
    visited = set()

    def _move_and_recurse(o, name=""):
        visited.add(id(o))
        child_moved = False
        # iterate over a snapshot: setattr below rebinds attributes of `o` while we loop
        for k, v in list(vars(o).items()):
            cur_name = f"{name}.{k}" if name != "" else k
            if cur_name in exclude:
                continue
            moved = False
            if isinstance(v, (torch.nn.Module, torch.nn.Parameter, torch.Tensor)):
                setattr(o, k, v.to(device))
                moved = True
            elif hasattr(v, "__dict__") and id(v) not in visited:
                v, moved = _move_and_recurse(v, name=cur_name)
                if moved:
                    setattr(o, k, v)
            if verbose and moved:
                print(f"moved {classname}.{cur_name} to {device}")
            child_moved |= moved
        return o, child_moved

    if isinstance(obj, torch.nn.Module):
        obj = obj.to(device)

    obj, _ = _move_and_recurse(obj)
    # str() so that torch.device instances are accepted as well as plain strings
    if "cuda" in str(device) and empty_cuda_cache:
        torch.cuda.empty_cache()
    return obj
def tensors_to(tensors, device):
    """
    move a tensor, or every tensor found in a (possibly nested) container, to the given device.

    tensors are moved with `.to(device)`; objects exposing a `__dict__` are delegated to `object_to`
    (without emptying the CUDA cache); lists, tuples, dicts and sets are rebuilt element by element
    and keep their container type (tuples used to come back as lists). anything else is returned
    unchanged.
    """
    if isinstance(tensors, torch.Tensor):
        return tensors.to(device)
    if hasattr(tensors, "__dict__"):
        return object_to(tensors, device, empty_cuda_cache=False)
    if isinstance(tensors, tuple):
        return tuple(tensors_to(x, device) for x in tensors)
    if isinstance(tensors, list):
        return [tensors_to(x, device) for x in tensors]
    if isinstance(tensors, dict):
        return {k: tensors_to(v, device) for k, v in tensors.items()}
    if isinstance(tensors, set):
        return {tensors_to(x, device) for x in tensors}
    return tensors
def tensors_to_cpu(tensors):
    """shorthand for `tensors_to` with the cpu as destination device."""
    return tensors_to(tensors, device="cpu")
@contextmanager
def obj_on_device(model, src="cpu", dst="cuda", exclude=None, empty_cuda_cache=True, verbose_move=False):
    """
    context manager that moves `model` to the `dst` device for the duration of the `with` block
    and moves it back to `src` afterwards.

    the arguments `exclude`, `empty_cuda_cache` and `verbose_move` are forwarded to `object_to`.
    the value bound by `with ... as m` is the moved model. the move back to `src` happens in a
    `finally` clause so the model does not stay on `dst` when the body raises.
    """
    model = object_to(model, dst, exclude=exclude, empty_cuda_cache=empty_cuda_cache, verbose=verbose_move)
    try:
        yield model
    finally:
        object_to(model, src, exclude=exclude, empty_cuda_cache=empty_cuda_cache, verbose=verbose_move)
def stack_audio_tensors(tensors, mode="pad"):
    """
    stack tensors whose last dimension differs in length into one batch tensor.

    the last dimension is the one that gets resized; all other dimensions must already match
    for `torch.stack` to succeed.

    modes:
      - "pad_r": zero-pad at the end, up to the longest input
      - "pad" / "pad_l": zero-pad at the start, up to the longest input
      - "trunc_r": keep the last samples, down to the shortest input
      - "trunc" / "trunc_l": keep the first samples, down to the shortest input

    raises ValueError when `mode` is not one of the above.
    """
    sizes = [x.shape[-1] for x in tensors]

    if mode in {"pad_l", "pad_r", "pad"}:
        # pad input tensors to be equal length
        dst_size = max(sizes)
        if mode == "pad_r":
            stack_tensors = [pad(x, pad=(0, dst_size - x.shape[-1])) for x in tensors]
        else:
            stack_tensors = [pad(x, pad=(dst_size - x.shape[-1], 0)) for x in tensors]
    elif mode in {"trunc_l", "trunc_r", "trunc"}:
        # truncate input tensors to be equal length; `...` keeps the slice on the last
        # dimension whatever the rank, consistent with how `sizes` is measured
        dst_size = min(sizes)
        if mode == "trunc_r":
            stack_tensors = [x[..., x.shape[-1] - dst_size:] for x in tensors]
        else:
            stack_tensors = [x[..., :dst_size] for x in tensors]
    else:
        # an explicit exception (not `assert`) so the check survives `python -O`
        raise ValueError(f'unknown mode "{mode}"')

    return torch.stack(stack_tensors)
class MusicgenGenerate:
    """
    ComfyUI node that generates audio with an audiocraft MusicGen / AudioGen model.

    the `text` widget accepts either a plain prompt or a python literal list of prompts
    (e.g. `["calm piano", "fast drums"]`), in which case one clip is generated per prompt.
    an empty text triggers unconditional generation; an optional input audio triggers a
    continuation of that audio.
    """

    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": ("MUSICGEN_MODEL",),
                "text": ("STRING", {"default": "", "multiline": True}),
                "batch_size": ("INT", {"default": 1, "min": 1}),
                "duration": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 300.0, "step": 0.01}),
                "cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
                "top_k": ("INT", {"default": 250, "min": 0, "max": 10000, "step": 1}),
                "top_p": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
                "temperature": ("FLOAT", {"default": 1.0, "min": 0.001, "step": 0.001}),
                "seed": ("INT", {"default": 0, "min": 0}),
            },
            "optional": {"audio": ("AUDIO_TENSOR",)},
        }

    # the node outputs one list of audio tensors, the same type it accepts as optional input
    RETURN_TYPES = ("AUDIO_TENSOR",)
    FUNCTION = "generate"
    CATEGORY = "audio"

    @staticmethod
    def _parse_prompts(text, batch_size=1):
        """
        turn the content of the text widget into the list of prompts handed to the model.

        - None or "" -> None (no text conditioning)
        - a python literal list/tuple of strings -> that list, one prompt per generated clip
        - a python literal string -> that string repeated `batch_size` times
        - anything else (plain text, other literals) -> the raw text repeated `batch_size` times
        """
        if text is None or text == "":
            return None
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # not a python literal: the text is a regular prompt
            parsed = text
        if isinstance(parsed, (list, tuple)) and parsed and all(isinstance(p, str) for p in parsed):
            return list(parsed)
        if isinstance(parsed, str):
            return [parsed] * batch_size
        return [text] * batch_size

    def generate(
        self,
        model: "Union[AudioGen, MusicGen]",
        text: str = "",
        batch_size: int = 1,
        duration: float = 10.0,
        cfg: float = 1.0,
        top_k: int = 250,
        top_p: float = 0.0,
        temperature: float = 1.0,
        seed: int = 0,
        audio: Optional[torch.Tensor] = None,
    ):
        """
        run the model and return a 1-tuple holding the list of generated audio tensors (on cpu).

        the model is moved to cuda when available for the duration of the call, and the global
        torch RNG state is forked so that seeding here does not leak outside the node.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # empty string = unconditional generation (text_input is None in that case)
        text_input = self._parse_prompts(text, batch_size)

        model.set_generation_params(
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            duration=duration,
            cfg_coef=cfg,
        )
        with torch.random.fork_rng(), obj_on_device(model, dst=device, verbose_move=True) as m:
            torch.manual_seed(seed)
            print(text_input)
            if audio is not None:
                # do continuation with input audio and (optional) text prompting
                if isinstance(audio, list):
                    # left-padded stacking into batch tensor
                    audio = stack_audio_tensors(audio)

                if audio.shape[0] < batch_size:
                    # (try to) expand batch if smaller than requested
                    audio = audio.expand(batch_size, -1, -1)
                elif audio.shape[0] > batch_size:
                    # truncate batch if larger than requested
                    audio = audio[:batch_size]

                audio_input = tensors_to(audio, device)
                audio_out = m.generate_continuation(audio_input, model.sample_rate, text_input, progress=True)
            elif text_input is not None:
                # do text-to-music
                audio_out = m.generate(text_input, progress=True)
            else:
                # do unconditional music generation
                audio_out = m.generate_unconditional(batch_size, progress=True)

            audio_out = tensors_to_cpu(audio_out)

        audio_out = torch.unbind(audio_out)
        do_cleanup()
        return list(audio_out),
+ "MusicgenGenerateCustom": MusicgenGenerate,
+ "MusicgenGenerateCustom": "Musicgen Generator Custom",
\ No newline at end of file
Define a VerticalMiddleMask node that masks only the vertical strip at the center of an image.
+import torch
class VerticalMiddleMask:
    """
    A node that builds a mask covering a vertical strip centered on the image.

    The mask is 1 inside the strip and 0 elsewhere.

    Class methods
    -------------
    INPUT_TYPES (dict):
        Tell the main program input parameters of nodes.

    Attributes
    ----------
    RETURN_TYPES (`tuple`):
        The type of each element in the output tuple.
    RETURN_NAMES (`tuple`):
        The name of each output in the output tuple (Optional).
    FUNCTION (`str`):
        The name of the entry-point method, here `main`.
    CATEGORY (`str`):
        The category the node should appear in the UI.
    """

    @classmethod
    def INPUT_TYPES(self):
        """
        Return a dictionary which contains config for all input fields.

        Returns: `dict`:
            - `image`: the image the mask is built for.
            - `mask_width`: width of the masked strip, in pixels.
        """
        return {
            "required": {
                "image": ("IMAGE",),
                "mask_width": ("INT", {
                    "default": 16,
                    "min": 0,  # Minimum value
                    "max": 1024,  # Maximum value
                    "step": 4,  # Slider's step
                    "display": "number"  # Cosmetic only: display as "number" or "slider"
                })
            },
        }

    RETURN_TYPES = ("MASK", )
    RETURN_NAMES = ("Mask", )

    FUNCTION = "main"

    # OUTPUT_NODE = False

    CATEGORY = "mask"

    def main(self, image, mask_width):
        """
        Build the mask for `image`.

        The mask has the shape of `image` without its last dimension, and the strip is taken
        along dimension 2 (assumes a channel-last (batch, height, width, channels) image).
        The strip is clipped to the image, so a `mask_width` larger than the image masks it entirely.

        :return tuple: 1-tuple holding the mask tensor
        """
        mask = torch.zeros(image.shape[:-1], device=image.device)
        width = image.shape[2]
        # Clamp to the image bounds: a negative start would otherwise wrap around to the right side
        start = max(0, width // 2 - mask_width // 2)
        # Derive the end from the start so that odd widths are honored exactly
        stop = min(width, start + mask_width)
        mask[:, :, start:stop] = 1
        return mask,
+# A dictionary that contains all nodes you want to export with their names
+# NOTE: names should be globally unique
+ "VerticalMiddleMask": VerticalMiddleMask
+# A dictionary that contains the friendly/humanly readable titles for the nodes
+ "VerticalMiddleMask": "VerticalMiddleMask"
Definition of the MiddleSplit node.
+import torch
class MiddleSplit:
    """
    A node that splits an image in the middle and returns the two parts in mirror.

    The image is mirrored horizontally, cut at the middle column, and the two halves are swapped,
    so that the original left and right borders end up next to each other at the center.

    Class methods
    -------------
    INPUT_TYPES (dict):
        Tell the main program input parameters of nodes.

    Attributes
    ----------
    RETURN_TYPES (`tuple`):
        The type of each element in the output tuple.
    RETURN_NAMES (`tuple`):
        The name of each output in the output tuple (Optional).
    FUNCTION (`str`):
        The name of the entry-point method, here `main`.
    CATEGORY (`str`):
        The category the node should appear in the UI.
    """

    @classmethod
    def INPUT_TYPES(self):
        """
        Return a dictionary which contains config for all input fields.

        Returns: `dict`:
            - `image`: the image to split.
        """
        return {
            "required": {
                "image": ("IMAGE",),
            },
        }

    RETURN_TYPES = ("IMAGE", )
    RETURN_NAMES = ("Image", )

    FUNCTION = "main"

    # OUTPUT_NODE = False

    CATEGORY = "image"

    def main(self, image):
        """
        Mirror `image` along dimension 2 and swap its two halves.

        Assumes a (batch, height, width, channels) image, so that dimension 2 is the width.
        For an odd width, the extra column goes to the half that ends up on the left.

        :return tuple: 1-tuple holding the rearranged image
        """
        center = image.shape[2] // 2
        # Mirror along the width axis
        flipped = torch.flip(image, dims=[2])
        l_im, r_im = flipped[:, :, :center], flipped[:, :, center:]
        return torch.cat((r_im, l_im), 2),

    # The node is re-executed whenever one of its inputs changes. An IS_CHANGED class method
    # could be added to force a re-execution even when the inputs are unchanged: the value it
    # returns is compared to the one returned at the previous execution.
+# A dictionary that contains all nodes you want to export with their names
+# NOTE: names should be globally unique
+ "MiddleSplit": MiddleSplit
+# A dictionary that contains the friendly/humanly readable titles for the nodes
+ "MiddleSplit": "Split and flip Image"
Define an ImageReverseLambert node that applies a reverse Lambert (cylindrical) projection to an image.
+import torch
def cylindrical_projection(image):
    """
    Compute a cylindrical projection from a flat image.

    The x-axis is preserved, but the y-axis will be changed: each output row is a linear
    interpolation between two neighbouring input rows.
    This is the inverse operation of a Lambert projection.

    :param torch.tensor image: Input image, with the rows on dimension 1
    :return torch.tensor: Output image in reversed cylindrical projection, same shape as the input
    """
    height = image.shape[1]
    if height == 0:
        return image.clone()

    # For each output row, the (fractional) input row to sample, from [0, height] to [0, height]
    # As the view is essentially from a cylinder to a sphere, a cosine transformation is applied
    # We then apply a reverse cosine
    lines = height * (1 - torch.arccos(torch.linspace(-1, 1, height, device=image.device)) / torch.pi)

    # Row below the sampling position; clamped because rounding may push `lines` slightly out of range
    lower = lines.floor().clamp(0, height - 1)
    # Interpolation weight = fractional part relative to the SAME row used as lower index
    ratios = (lines - lower).clamp(0, 1)
    lower = lower.long()
    # Past the last row there is nothing to interpolate with: reuse the last row
    upper = (lower + 1).clamp(max=height - 1)

    # Make the per-row weights broadcastable over every other dimension
    ratios = ratios.reshape([1, height] + [1] * (image.dim() - 2))
    return image[:, upper] * ratios + image[:, lower] * (1 - ratios)
class ImageReverseLambert:
    """
    A node that applies a reverse Lambert (cylindrical) projection to an image.

    Class methods
    -------------
    INPUT_TYPES (dict):
        Tell the main program input parameters of nodes.

    Attributes
    ----------
    RETURN_TYPES (`tuple`):
        The type of each element in the output tuple.
    FUNCTION (`str`):
        The name of the entry-point method, here `main`.
    CATEGORY (`str`):
        The category the node should appear in the UI.
    """

    @classmethod
    def INPUT_TYPES(self):
        """
        Return a dictionary which contains config for all input fields.

        Returns: `dict`:
            - `image`: the flat image to project.
        """
        return {
            "required": {
                "image": ("IMAGE",),
            }
        }

    RETURN_TYPES = ("IMAGE", )

    FUNCTION = "main"

    # OUTPUT_NODE = False

    CATEGORY = "image"

    def main(self, image):
        """
        Project `image` with `cylindrical_projection`.

        :return tuple: 1-tuple holding the projected image
        """
        return cylindrical_projection(image),
+# A dictionary that contains all nodes you want to export with their names
+# NOTE: names should be globally unique
+ "ImageReverseLambert": ImageReverseLambert
+# A dictionary that contains the friendly/humanly readable titles for the nodes
+ "ImageReverseLambert": "Project as Reversed Lambert"
Define an outpainting node that stretches the borders of the original image.
+import torch
def pad_image(image, top, bottom):
    """
    Add `top` and `bottom` empty rows around an image and build the matching mask.

    The rows are added along dimension 1. The padded image keeps the dtype and device of the input.

    :param torch.tensor image: Input image, with the rows on dimension 1
    :param int top: Number of rows added before the image
    :param int bottom: Number of rows added after the image
    :return tuple: The zero-padded image, and a mask (shape of the padded image without its
        last dimension) set to 1 on the added rows and 0 on the original image
    """
    initial_height = image.shape[1]
    shape = list(image.shape)
    shape[1] = initial_height + top + bottom

    # new_zeros keeps the dtype/device of the input instead of forcing a float32 CPU tensor
    output = image.new_zeros(shape)
    mask = torch.zeros(shape[:-1], device=image.device)

    output[:, top:top + initial_height] = image
    mask[:, :top] = 1
    mask[:, top + initial_height:] = 1
    return output, mask
def stretch_image(padded_image, top, bottom, border):
    """
    Fill the padded rows of an image with the average of the nearest rows of the original image.

    The input is not modified; a detached copy is returned.

    :param torch.tensor padded_image: Image already padded with `top` and `bottom` rows on dimension 1
    :param int top: Number of padded rows before the original image
    :param int bottom: Number of padded rows after the original image
    :param int border: Number of rows of the original image averaged on each side. It is limited
        to the height of the original image; with 0, the padded rows are left untouched
    :return torch.tensor: The stretched image
    """
    output = padded_image.clone().detach()

    # Never average more rows than the original image has, or padded rows would leak into the mean
    inner_height = padded_image.shape[1] - top - bottom
    border = min(border, inner_height)
    if border <= 0:
        # Averaging an empty slice would fill the padding with NaN
        return output

    if top > 0:
        top_area = padded_image[:, top:top + border]
        # keepdim leaves a single row that broadcasts over all the padded rows
        output[:, :top] = torch.mean(top_area, 1, keepdim=True)
    if bottom > 0:
        bottom_area = padded_image[:, -bottom - border:-bottom]
        output[:, -bottom:] = torch.mean(bottom_area, 1, keepdim=True)
    return output
class ImageStretchForOutpaint:
    """
    A node that pads an image at the top and bottom for outpainting.

    The added rows are filled with the average color of the nearest rows of the original image,
    and a mask of the added rows is returned along with the image.

    Class methods
    -------------
    INPUT_TYPES (dict):
        Tell the main program input parameters of nodes.

    Attributes
    ----------
    RETURN_TYPES (`tuple`):
        The type of each element in the output tuple.
    FUNCTION (`str`):
        The name of the entry-point method, here `main`.
    CATEGORY (`str`):
        The category the node should appear in the UI.
    """

    @classmethod
    def INPUT_TYPES(self):
        """
        Return a dictionary which contains config for all input fields.

        Returns: `dict`:
            - `image`: the image to extend.
            - `top` / `bottom`: number of rows added above / below the image.
            - `border`: number of rows of the image averaged to fill the added rows.
        """
        outpainting_settings = {
            "default": 0,
            "min": 0,  # Minimum value
            "max": 1024,  # Maximum value
            "step": 4,  # Slider's step
            "display": "number"  # Cosmetic only: display as "number" or "slider"
        }
        return {
            "required": {
                "image": ("IMAGE",),
                # Each field gets its own copy so that the two configs stay independent
                "top": ("INT", dict(outpainting_settings)),
                "bottom": ("INT", dict(outpainting_settings)),
                "border": ("INT", {
                    "default": 5,
                    "min": 0,
                    "max": 1024,
                    "step": 1,
                    "display": "number"
                })
            },
        }

    RETURN_TYPES = ("IMAGE", "MASK", )
    # RETURN_NAMES = ("Image", "Mask", )

    FUNCTION = "main"

    # OUTPUT_NODE = False

    CATEGORY = "image"

    def main(self, image, top, bottom, border):
        """
        Pad `image` with `pad_image`, then fill the added rows with `stretch_image`.

        :return tuple: The stretched image and the mask of the added rows
        """
        padded_image, mask = pad_image(image, top, bottom)
        output = stretch_image(padded_image, top, bottom, border)
        return output, mask
+# A dictionary that contains all nodes you want to export with their names
+# NOTE: names should be globally unique
+ "ImageStretchForOutpaint": ImageStretchForOutpaint
+# A dictionary that contains the friendly/humanly readable titles for the nodes
+ "ImageStretchForOutpaint": "Stretch Image for Outpainting"
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 39
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SpectrogramImage"
+ },
+ "widgets_values": [
+ 200,
+ 50,
+ 100,
+ 1,
+ false,
+ true
+ ]
+ }
+ ],
+ "links": [
+ [
+ 1,
+ 4,
+ 0,
+ 3,
+ 0,
+ ],
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ ],
+ [
+ 3,
+ 4,
+ 1,
+ 6,
+ 0,
+ "CLIP"
+ ],
+ [
+ 4,
+ 6,
+ 0,
+ 3,
+ 1,
+ ],
+ [
+ 5,
+ 4,
+ 1,
+ 7,
+ 0,
+ "CLIP"
+ ],
+ [
+ 6,
+ 7,
+ 0,
+ 3,
+ 2,
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ ],
+ [
+ 10,
+ 11,
+ 0,
+ 10,
+ 0,
+ ],
+ [
+ 13,
+ 11,
+ 0,
+ 6,
+ 1,
+ ],
+ [
+ 22,
+ 11,
+ 0,
+ 22,
+ 0,
+ ],
+ [
+ 23,
+ 22,
+ 0,
+ 18,
+ 0,
+ ],
+ [
+ 24,
+ 22,
+ 0,
+ 23,
+ 0,
+ ],
+ [
+ 25,
+ 18,
+ 0,
+ 24,
+ 0,
+ ],
+ [
+ 32,
+ 4,
+ 2,
+ 34,
+ 0,
+ "*"
+ ],
+ [
+ 33,
+ 34,
+ 0,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 37,
+ 25,
+ 0,
+ 36,
+ 0,
+ ],
+ [
+ 38,
+ 36,
+ 0,
+ 37,
+ 0,
+ ],
+ [
+ 39,
+ 37,
+ 0,
+ 38,
+ 0,
+ ],
+ [
+ 41,
+ 36,
+ 0,
+ 33,
+ 0,
+ ],
+ [
+ 44,
+ 18,
+ 0,
+ 36,
+ 2,
+ ],
+ [
+ 45,
+ 8,
+ 0,
+ 9,
+ 0,
+ ],
+ [
+ 47,
+ 48,
+ 0,
+ 33,
+ 1,
+ "INT"
+ ],
+ [
+ 48,
+ 25,
+ 1,
+ 48,
+ 0,
+ "*"
+ ]
+ ],
+ "groups": [
+ {
+ "title": "Image generation",
+ "bounding": [
+ -478,
+ -174,
+ 2754,
+ 376
+ ],
+ "color": "#b58b2a",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Inputs preparation",
+ "bounding": [
+ -883,
+ 393,
+ 1984,
+ 426
+ ],
+ "color": "#8A8",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Audio Generation",
+ "bounding": [
+ 1173,
+ 380,
+ 1669,
+ 774
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ }
+ ],
+ "config": {},
+ "extra": {},
+ "version": 0.4
\ No newline at end of file
diff --git a/ComfyUI/workflows/panorama_creator.json b/ComfyUI/workflows/panorama_creator.json
new file mode 100644
index 0000000..411bdf0
--- /dev/null
+++ b/ComfyUI/workflows/panorama_creator.json
@@ -0,0 +1,2490 @@
+ "last_node_id": 140,
+ "last_link_id": 241,
+ "nodes": [
+ {
+ "id": 72,
+ "type": "PreviewImage",
+ "pos": [
+ 6732.320592261197,
+ 1022.4875354181333
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 37,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 116
+ }
+ ],
+ "title": "Refined Inpainted Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 69,
+ "type": "PreviewImage",
+ "pos": [
+ 6728.320592261197,
+ 680.4875354181331
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 34,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 114
+ }
+ ],
+ "title": "Inpainted Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 64,
+ "type": "PreviewImage",
+ "pos": [
+ 6726.320592261197,
+ 348.48753541813363
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 110
+ }
+ ],
+ "title": "Masked Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 47,
+ "type": "PrimitiveNode",
+ "pos": [
+ -436.65502631696484,
+ 326.1107483300291
+ ],
+ "size": {
+ "0": 210,
+ "1": 76.00001525878906
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 84,
+ 98
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "a text, a logo, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 46,
+ "type": "PrimitiveNode",
+ "pos": [
+ -441.48548678978943,
+ 197.57055928998972
+ ],
+ "size": {
+ "0": 210,
+ "1": 76.00000762939453
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 81,
+ 97
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "a sunny valley"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 76,
+ "type": "PrimitiveNode",
+ "pos": [
+ -431.8105058402872,
+ 597.6940746637698
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 120,
+ 122,
+ 131
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "title": "Initial Image Height\n",
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ 512,
+ "fixed"
+ ]
+ },
+ {
+ "id": 75,
+ "type": "PrimitiveNode",
+ "pos": [
+ -435.8105058402872,
+ 464.6940746637689
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 119,
+ 121,
+ 130
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "width"
+ }
+ }
+ ],
+ "title": "Initial Image Width\n",
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ 2048,
+ "fixed"
+ ]
+ },
+ {
+ "id": 82,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 187.1067275294476,
+ -76.79146373353836
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 130,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 131,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 132
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 2
+ ]
+ },
+ {
+ "id": 56,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 177.90388094884744,
+ 103.65074083354882
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 95
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 97,
+ "widget": {
+ "name": "text_g"
+ }
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 119,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 120,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 124
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 0,
+ 0,
+ 2048,
+ 512,
+ "a sunny valley",
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 57,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 182.90388094884744,
+ 156.6507408335485
+ ],
+ "size": {
+ "0": 400,
+ "1": 270.0000305175781
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 99
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 98,
+ "widget": {
+ "name": "text_g"
+ }
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 121,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 122,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 125
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 0,
+ 0,
+ 2048,
+ 512,
+ "a text, a logo, borders",
+ "a logo, text, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 126,
+ "type": "PreviewImage",
+ "pos": [
+ 6710,
+ -160
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 194
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 49,
+ "type": "VAEDecode",
+ "pos": [
+ 5563.898602057173,
+ 1528.4313459247896
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 35,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 88
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 77
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 116,
+ 117
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 130,
+ "type": "Reroute",
+ "pos": [
+ 2396.8792072153574,
+ 762.4135259871944
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 215
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 216,
+ 217
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 68,
+ "type": "SaveImage",
+ "pos": [
+ 6668,
+ 1410
+ ],
+ "size": {
+ "0": 315,
+ "1": 270.0000305175781
+ },
+ "flags": {},
+ "order": 36,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 113
+ }
+ ],
+ "title": "Inpainted Image",
+ "properties": {},
+ "widgets_values": [
+ "skybox"
+ ]
+ },
+ {
+ "id": 50,
+ "type": "SaveImage",
+ "pos": [
+ 6677,
+ 1729
+ ],
+ "size": {
+ "0": 315,
+ "1": 270.00006103515625
+ },
+ "flags": {},
+ "order": 39,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 118
+ }
+ ],
+ "title": "Inpainted Refined Image",
+ "properties": {},
+ "widgets_values": [
+ "skybox_refined"
+ ]
+ },
+ {
+ "id": 73,
+ "type": "MiddleSplit",
+ "pos": [
+ 5821.898602057173,
+ 1611.4313459247896
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 38,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 117
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 118,
+ 231
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 136,
+ "type": "ImageReverseLambert",
+ "pos": [
+ 6061.898602057173,
+ 1666.4313459247896
+ ],
+ "size": {
+ "0": 226.8000030517578,
+ "1": 26
+ },
+ "flags": {},
+ "order": 40,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 231
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 232
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageReverseLambert"
+ }
+ },
+ {
+ "id": 137,
+ "type": "SaveImage",
+ "pos": [
+ 6664,
+ 2058
+ ],
+ "size": {
+ "0": 315,
+ "1": 270
+ },
+ "flags": {},
+ "order": 41,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 232
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "skybox_refined_lambert"
+ ]
+ },
+ {
+ "id": 125,
+ "type": "VAEDecode",
+ "pos": [
+ 1508.8792072153576,
+ 425.41352598719476
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 237
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 214
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 194,
+ 201,
+ 202
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 129,
+ "type": "Reroute",
+ "pos": [
+ 1088,
+ 631
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 235
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 214,
+ 215
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 127,
+ "type": "Reroute",
+ "pos": [
+ 289,
+ 579
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 203
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 204,
+ 235
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 79,
+ "type": "VAEDecode",
+ "pos": [
+ 1240,
+ 58
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 127
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 204
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 133
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 34,
+ "type": "UNETLoader",
+ "pos": [
+ -471,
+ 958
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 236
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader"
+ },
+ "widgets_values": [
+ "sd_xl_inpainting_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 78,
+ "type": "KSampler",
+ "pos": [
+ 671.1067275294507,
+ -109.79146373353866
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 123
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 124
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 125
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 132
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 127,
+ 237
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 7898391413359,
+ "randomize",
+ 20,
+ 8,
+ "euler",
+ "normal",
+ 0.8
+ ]
+ },
+ {
+ "id": 71,
+ "type": "PreviewImage",
+ "pos": [
+ 6737,
+ -552
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 133
+ }
+ ],
+ "title": "Initial Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 42,
+ "type": "workflow/Blurry Mask",
+ "pos": [
+ 2419,
+ 751
+ ],
+ "size": {
+ "0": 315,
+ "1": 318
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 106
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 67
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "workflow/Blurry Mask"
+ },
+ "widgets_values": [
+ 10,
+ 1,
+ "red"
+ ]
+ },
+ {
+ "id": 21,
+ "type": "MiddleSplit",
+ "pos": [
+ 2057,
+ 459
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 202
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 42,
+ 49,
+ 109
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 24,
+ "type": "VerticalMiddleMask",
+ "pos": [
+ 2058,
+ 658
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 201
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Mask",
+ "type": "MASK",
+ "links": [
+ 106,
+ 107
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VerticalMiddleMask"
+ },
+ "widgets_values": [
+ 168
+ ]
+ },
+ {
+ "id": 63,
+ "type": "MaskToImage",
+ "pos": [
+ 2429,
+ 635
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 107
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 108
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage"
+ }
+ },
+ {
+ "id": 61,
+ "type": "ImageBlend",
+ "pos": [
+ 2769,
+ 688
+ ],
+ "size": {
+ "0": 315,
+ "1": 102
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image1",
+ "type": "IMAGE",
+ "link": 109
+ },
+ {
+ "name": "image2",
+ "type": "IMAGE",
+ "link": 108
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 110
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageBlend"
+ },
+ "widgets_values": [
+ 0.5,
+ "multiply"
+ ]
+ },
+ {
+ "id": 32,
+ "type": "KSampler",
+ "pos": [
+ 3329,
+ 784
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 236
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 238,
+ "slot_index": 1
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 239,
+ "slot_index": 2
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 55
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 56,
+ 111
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 10826311454902,
+ "randomize",
+ 50,
+ 8,
+ "euler",
+ "normal",
+ 0.9
+ ]
+ },
+ {
+ "id": 27,
+ "type": "VAEEncodeForInpaint",
+ "pos": [
+ 2845,
+ 902
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 49
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 216
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 67
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 55
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncodeForInpaint"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 28,
+ "type": "VAEDecode",
+ "pos": [
+ 3694,
+ 890
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 56
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 217
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 112,
+ 114
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 67,
+ "type": "MiddleSplit",
+ "pos": [
+ 3925,
+ 1019
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 112
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 113
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 139,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1540,
+ 564
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 240
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 238
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 25,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -485.65502631696495,
+ 746.1107483300289
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 123
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 95,
+ 99,
+ 240,
+ 241
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_base_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 140,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1534,
+ 628
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 241
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 239
+ ],
+ "shape": 3
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 3,
+ "type": "PreviewImage",
+ "pos": [
+ 2935,
+ 396
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 42
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 44,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 4266,
+ 1271
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 82
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 80,
+ 85
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 77
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_refiner_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 53,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 4766,
+ 1526
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 85
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 84,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 86
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "a text, a logo, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 51,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 4756,
+ 1467
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 80
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 81,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 83
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "a sunny valley"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 52,
+ "type": "KSampler",
+ "pos": [
+ 5184,
+ 1463
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 82
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 83
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 86
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 111
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 88
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 418024017608548,
+ "randomize",
+ 30,
+ 8,
+ "euler",
+ "normal",
+ 0.2
+ ]
+ }
+ ],
+ "links": [
+ [
+ 42,
+ 21,
+ 0,
+ 3,
+ 0,
+ ],
+ [
+ 49,
+ 21,
+ 0,
+ 27,
+ 0,
+ ],
+ [
+ 55,
+ 27,
+ 0,
+ 32,
+ 3,
+ ],
+ [
+ 56,
+ 32,
+ 0,
+ 28,
+ 0,
+ ],
+ [
+ 67,
+ 42,
+ 0,
+ 27,
+ 2,
+ "MASK"
+ ],
+ [
+ 77,
+ 44,
+ 2,
+ 49,
+ 1,
+ "VAE"
+ ],
+ [
+ 80,
+ 44,
+ 1,
+ 51,
+ 0,
+ "CLIP"
+ ],
+ [
+ 81,
+ 46,
+ 0,
+ 51,
+ 1,
+ ],
+ [
+ 82,
+ 44,
+ 0,
+ 52,
+ 0,
+ ],
+ [
+ 83,
+ 51,
+ 0,
+ 52,
+ 1,
+ ],
+ [
+ 84,
+ 47,
+ 0,
+ 53,
+ 1,
+ ],
+ [
+ 85,
+ 44,
+ 1,
+ 53,
+ 0,
+ "CLIP"
+ ],
+ [
+ 86,
+ 53,
+ 0,
+ 52,
+ 2,
+ ],
+ [
+ 88,
+ 52,
+ 0,
+ 49,
+ 0,
+ ],
+ [
+ 95,
+ 25,
+ 1,
+ 56,
+ 0,
+ "CLIP"
+ ],
+ [
+ 97,
+ 46,
+ 0,
+ 56,
+ 1,
+ ],
+ [
+ 98,
+ 47,
+ 0,
+ 57,
+ 1,
+ ],
+ [
+ 99,
+ 25,
+ 1,
+ 57,
+ 0,
+ "CLIP"
+ ],
+ [
+ 106,
+ 24,
+ 0,
+ 42,
+ 0,
+ "MASK"
+ ],
+ [
+ 107,
+ 24,
+ 0,
+ 63,
+ 0,
+ "MASK"
+ ],
+ [
+ 108,
+ 63,
+ 0,
+ 61,
+ 1,
+ ],
+ [
+ 109,
+ 21,
+ 0,
+ 61,
+ 0,
+ ],
+ [
+ 110,
+ 61,
+ 0,
+ 64,
+ 0,
+ ],
+ [
+ 111,
+ 32,
+ 0,
+ 52,
+ 3,
+ ],
+ [
+ 112,
+ 28,
+ 0,
+ 67,
+ 0,
+ ],
+ [
+ 113,
+ 67,
+ 0,
+ 68,
+ 0,
+ ],
+ [
+ 114,
+ 28,
+ 0,
+ 69,
+ 0,
+ ],
+ [
+ 116,
+ 49,
+ 0,
+ 72,
+ 0,
+ ],
+ [
+ 117,
+ 49,
+ 0,
+ 73,
+ 0,
+ ],
+ [
+ 118,
+ 73,
+ 0,
+ 50,
+ 0,
+ ],
+ [
+ 119,
+ 75,
+ 0,
+ 56,
+ 2,
+ "INT"
+ ],
+ [
+ 120,
+ 76,
+ 0,
+ 56,
+ 3,
+ "INT"
+ ],
+ [
+ 121,
+ 75,
+ 0,
+ 57,
+ 2,
+ "INT"
+ ],
+ [
+ 122,
+ 76,
+ 0,
+ 57,
+ 3,
+ "INT"
+ ],
+ [
+ 123,
+ 25,
+ 0,
+ 78,
+ 0,
+ ],
+ [
+ 124,
+ 56,
+ 0,
+ 78,
+ 1,
+ ],
+ [
+ 125,
+ 57,
+ 0,
+ 78,
+ 2,
+ ],
+ [
+ 127,
+ 78,
+ 0,
+ 79,
+ 0,
+ ],
+ [
+ 130,
+ 75,
+ 0,
+ 82,
+ 0,
+ "INT"
+ ],
+ [
+ 131,
+ 76,
+ 0,
+ 82,
+ 1,
+ "INT"
+ ],
+ [
+ 132,
+ 82,
+ 0,
+ 78,
+ 3,
+ ],
+ [
+ 133,
+ 79,
+ 0,
+ 71,
+ 0,
+ ],
+ [
+ 194,
+ 125,
+ 0,
+ 126,
+ 0,
+ ],
+ [
+ 201,
+ 125,
+ 0,
+ 24,
+ 0,
+ ],
+ [
+ 202,
+ 125,
+ 0,
+ 21,
+ 0,
+ ],
+ [
+ 203,
+ 25,
+ 2,
+ 127,
+ 0,
+ "*"
+ ],
+ [
+ 204,
+ 127,
+ 0,
+ 79,
+ 1,
+ "VAE"
+ ],
+ [
+ 214,
+ 129,
+ 0,
+ 125,
+ 1,
+ "VAE"
+ ],
+ [
+ 215,
+ 129,
+ 0,
+ 130,
+ 0,
+ "*"
+ ],
+ [
+ 216,
+ 130,
+ 0,
+ 27,
+ 1,
+ "VAE"
+ ],
+ [
+ 217,
+ 130,
+ 0,
+ 28,
+ 1,
+ "VAE"
+ ],
+ [
+ 231,
+ 73,
+ 0,
+ 136,
+ 0,
+ ],
+ [
+ 232,
+ 136,
+ 0,
+ 137,
+ 0,
+ ],
+ [
+ 235,
+ 127,
+ 0,
+ 129,
+ 0,
+ "*"
+ ],
+ [
+ 236,
+ 34,
+ 0,
+ 32,
+ 0,
+ ],
+ [
+ 237,
+ 78,
+ 0,
+ 125,
+ 0,
+ ],
+ [
+ 238,
+ 139,
+ 0,
+ 32,
+ 1,
+ ],
+ [
+ 239,
+ 140,
+ 0,
+ 32,
+ 2,
+ ],
+ [
+ 240,
+ 25,
+ 1,
+ 139,
+ 0,
+ "CLIP"
+ ],
+ [
+ 241,
+ 25,
+ 1,
+ 140,
+ 0,
+ "CLIP"
+ ]
+ ],
+ "groups": [
+ {
+ "title": "Horizontal Tiling",
+ "bounding": [
+ 1326,
+ 320,
+ 2824,
+ 915
+ ],
+ "color": "#b58b2a",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "User Inputs",
+ "bounding": [
+ -496,
+ 124,
+ 335,
+ 731
+ ],
+ "color": "#88A",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Refining",
+ "bounding": [
+ 4232,
+ 1208,
+ 2062,
+ 534
+ ],
+ "color": "#8A8",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Output",
+ "bounding": [
+ 6657,
+ 1324,
+ 335,
+ 1038
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Previews",
+ "bounding": [
+ 6717,
+ 274,
+ 236,
+ 1004
+ ],
+ "color": "#444",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Initial Image",
+ "bounding": [
+ 143,
+ -249,
+ 1342,
+ 443
+ ],
+ "color": "#8AA",
+ "font_size": 24,
+ "locked": false
+ }
+ ],
+ "config": {},
+ "extra": {
+ "groupNodes": {
+ "Blurry Mask": {
+ "nodes": [
+ {
+ "type": "MaskToImage",
+ "pos": [
+ 190,
+ 520
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage"
+ },
+ "index": 0
+ },
+ {
+ "type": "ImageBlur",
+ "pos": [
+ 320,
+ 650
+ ],
+ "size": {
+ "0": 315,
+ "1": 82
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageBlur"
+ },
+ "widgets_values": [
+ 20,
+ 1
+ ],
+ "index": 1
+ },
+ {
+ "type": "PreviewImage",
+ "pos": [
+ 500,
+ 640
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ },
+ "index": 2
+ },
+ {
+ "type": "ImageToMask",
+ "pos": [
+ 380,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageToMask"
+ },
+ "widgets_values": [
+ "red"
+ ],
+ "index": 3
+ }
+ ],
+ "links": [
+ [
+ null,
+ 0,
+ 0,
+ 0,
+ 24,
+ "MASK"
+ ],
+ [
+ 0,
+ 0,
+ 1,
+ 0,
+ 39,
+ ],
+ [
+ 1,
+ 0,
+ 2,
+ 0,
+ 38,
+ ],
+ [
+ 1,
+ 0,
+ 3,
+ 0,
+ 38,
+ ]
+ ],
+ "external": [
+ [
+ 3,
+ 0,
+ "MASK"
+ ]
+ ]
+ }
+ }
+ },
+ "version": 0.4
\ No newline at end of file
diff --git a/ComfyUI/workflows/sdxl.json b/ComfyUI/workflows/sdxl.json
new file mode 100644
index 0000000..c8ed132
--- /dev/null
+++ b/ComfyUI/workflows/sdxl.json
@@ -0,0 +1,366 @@
+ "last_node_id": 11,
+ "last_link_id": 9,
+ "nodes": [
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 515,
+ 130
+ ],
+ "size": {
+ "0": 422.84503173828125,
+ "1": 164.31304931640625
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 3
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 4
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "a serene landscape under the sun"
+ ]
+ },
+ {
+ "id": 9,
+ "type": "SaveImage",
+ "pos": [
+ 1765.2780151367188,
+ 130
+ ],
+ "size": {
+ "0": 210,
+ "1": 270
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 9
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "ComfyUI"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1455.2780151367188,
+ 130
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 9
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 100,
+ 130
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 2
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1024,
+ 512,
+ 1
+ ]
+ },
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 515,
+ 424.31304931640625
+ ],
+ "size": {
+ "0": 425.27801513671875,
+ "1": 180.6060791015625
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 5
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 6
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "text, watermark"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ 366
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1
+ ],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 3,
+ 5
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_base_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1040.2780151367188,
+ 130
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 1
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 4
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 6
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 610640153339234,
+ "randomize",
+ 20,
+ 8,
+ "euler",
+ "normal",
+ 1
+ ]
+ }
+ ],
+ "links": [
+ [
+ 1,
+ 4,
+ 0,
+ 3,
+ 0,
+ ],
+ [
+ 2,
+ 5,
+ 0,
+ 3,
+ 3,
+ ],
+ [
+ 3,
+ 4,
+ 1,
+ 6,
+ 0,
+ "CLIP"
+ ],
+ [
+ 4,
+ 6,
+ 0,
+ 3,
+ 1,
+ ],
+ [
+ 5,
+ 4,
+ 1,
+ 7,
+ 0,
+ "CLIP"
+ ],
+ [
+ 6,
+ 7,
+ 0,
+ 3,
+ 2,
+ ],
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 9,
+ 8,
+ 0,
+ 9,
+ 0,
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
\ No newline at end of file
diff --git a/ComfyUI/workflows/sdxl_inpainting_demo.json b/ComfyUI/workflows/sdxl_inpainting_demo.json
new file mode 100644
index 0000000..acd1b8b
--- /dev/null
+++ b/ComfyUI/workflows/sdxl_inpainting_demo.json
@@ -0,0 +1,830 @@
+ "last_node_id": 32,
+ "last_link_id": 45,
+ "nodes": [
+ {
+ "id": 15,
+ "type": "UNETLoader",
+ "pos": [
+ 100,
+ 130
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 16
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader"
+ },
+ "widgets_values": [
+ "sd_xl_inpainting_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 9,
+ "type": "SaveImage",
+ "pos": [
+ 1845,
+ 130
+ ],
+ "size": {
+ "0": 210,
+ "1": 270
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 9
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "ComfyUI"
+ ]
+ },
+ {
+ "id": 20,
+ "type": "VAEDecode",
+ "pos": [
+ 1845,
+ 530
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 38
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 23
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 22
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 21,
+ "type": "SaveImage",
+ "pos": [
+ 2155,
+ 130
+ ],
+ "size": [
+ 315,
+ 270
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 22
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "ComfyUI"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1430,
+ 130
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 9
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 18,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ 318
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 17
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 33,
+ 35
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 23
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_refiner_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 28,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 515,
+ 460
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 35
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 36
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "text, watermark"
+ ]
+ },
+ {
+ "id": 4,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ 546
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [],
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 40,
+ 42
+ ],
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 8,
+ 11
+ ],
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_base_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 12,
+ "type": "VAEEncodeForInpaint",
+ "pos": [
+ 515,
+ 1590
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 10
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 11
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 12
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 13
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncodeForInpaint"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1015,
+ 130
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 16
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 43
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 13
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 7,
+ 39
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 424987263190372,
+ "randomize",
+ 30,
+ 8,
+ "euler",
+ "normal",
+ 0.9
+ ]
+ },
+ {
+ "id": 19,
+ "type": "KSampler",
+ "pos": [
+ 1430,
+ 306
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 17
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 34
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 36
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 39
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 38
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1043567145924620,
+ "randomize",
+ 20,
+ 8,
+ "euler",
+ "normal",
+ 1
+ ]
+ },
+ {
+ "id": 10,
+ "type": "LoadImage",
+ "pos": [
+ 100,
+ 774
+ ],
+ "size": {
+ "0": 315,
+ "1": 314
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 10
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 12
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadImage"
+ },
+ "widgets_values": [
+ "clipspace/clipspace-mask-257596.png [input]",
+ "image"
+ ]
+ },
+ {
+ "id": 27,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 515,
+ 130
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 33
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 34
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "the sun shining"
+ ]
+ },
+ {
+ "id": 29,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 515,
+ 790
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 40
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 41
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 0,
+ 0,
+ 1024,
+ 1024,
+ "the sun shining",
+ ""
+ ]
+ },
+ {
+ "id": 30,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 515,
+ 1190
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 42
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 43
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 0,
+ 0,
+ 1024,
+ 1024,
+ "text, watermark",
+ "text, watermark"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 7,
+ 3,
+ 0,
+ 8,
+ 0,
+ ],
+ [
+ 8,
+ 4,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 9,
+ 8,
+ 0,
+ 9,
+ 0,
+ ],
+ [
+ 10,
+ 10,
+ 0,
+ 12,
+ 0,
+ ],
+ [
+ 11,
+ 4,
+ 2,
+ 12,
+ 1,
+ "VAE"
+ ],
+ [
+ 12,
+ 10,
+ 1,
+ 12,
+ 2,
+ "MASK"
+ ],
+ [
+ 13,
+ 12,
+ 0,
+ 3,
+ 3,
+ ],
+ [
+ 16,
+ 15,
+ 0,
+ 3,
+ 0,
+ ],
+ [
+ 17,
+ 18,
+ 0,
+ 19,
+ 0,
+ ],
+ [
+ 22,
+ 20,
+ 0,
+ 21,
+ 0,
+ ],
+ [
+ 23,
+ 18,
+ 2,
+ 20,
+ 1,
+ "VAE"
+ ],
+ [
+ 33,
+ 18,
+ 1,
+ 27,
+ 0,
+ "CLIP"
+ ],
+ [
+ 34,
+ 27,
+ 0,
+ 19,
+ 1,
+ ],
+ [
+ 35,
+ 18,
+ 1,
+ 28,
+ 0,
+ "CLIP"
+ ],
+ [
+ 36,
+ 28,
+ 0,
+ 19,
+ 2,
+ ],
+ [
+ 38,
+ 19,
+ 0,
+ 20,
+ 0,
+ ],
+ [
+ 39,
+ 3,
+ 0,
+ 19,
+ 3,
+ ],
+ [
+ 40,
+ 4,
+ 1,
+ 29,
+ 0,
+ "CLIP"
+ ],
+ [
+ 41,
+ 29,
+ 0,
+ 3,
+ 1,
+ ],
+ [
+ 42,
+ 4,
+ 1,
+ 30,
+ 0,
+ "CLIP"
+ ],
+ [
+ 43,
+ 30,
+ 0,
+ 3,
+ 2,
+ ]
+ ],
+ "groups": [],
+ "config": {},
+ "extra": {},
+ "version": 0.4
\ No newline at end of file
diff --git a/ComfyUI/workflows/sdxl_with_refiner.json b/ComfyUI/workflows/sdxl_with_refiner.json
new file mode 100644
index 0000000..488daaf
--- /dev/null
+++ b/ComfyUI/workflows/sdxl_with_refiner.json
@@ -0,0 +1,952 @@
+ "last_node_id": 34,
+ "last_link_id": 53,
+ "nodes": [
+ {
+ "id": 24,
+ "type": "VAEDecode",
+ "pos": [
+ 2233.40966796875,
+ -157.24490356445312
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 25
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 35
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 43
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 22,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 1228.8193005475018,
+ -192.48979517142766
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 44
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 37,
+ 40
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 35
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_refiner_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 31,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 1583,
+ 36
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 40
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 39,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 41
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 2048,
+ 1024,
+ "text, logo, borders, frame"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 30,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 1580,
+ -16
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 37
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 38,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 42
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 2048,
+ 1024,
+ "a beautiful landscape with trees and a mountain in the background"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 16,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ -200
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 18
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 11,
+ 15
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 12
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_base_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 14,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 469,
+ -56
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 11
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 17,
+ "widget": {
+ "name": "text_g"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 19
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 128,
+ 128,
+ 0,
+ 0,
+ 2048,
+ 1024,
+ "a beautiful landscape with trees and a mountain in the background",
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 17,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 464,
+ 29
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 15
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 23,
+ "widget": {
+ "name": "text_g"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 20
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 128,
+ 128,
+ 0,
+ 0,
+ 2048,
+ 1024,
+ "text, logo, borders, frame",
+ ""
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 28,
+ "type": "KSamplerAdvanced",
+ "pos": [
+ 1839.40966796875,
+ -184.24490356445312
+ ],
+ "size": {
+ "0": 315,
+ "1": 334
+ },
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 44
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 42
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 52
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 25
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSamplerAdvanced"
+ },
+ "widgets_values": [
+ "enable",
+ 606835094522680,
+ "randomize",
+ 100,
+ 8,
+ "dpmpp_2m_sde_gpu",
+ "normal",
+ 40,
+ 10000,
+ "disable"
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 19,
+ "type": "KSamplerAdvanced",
+ "pos": [
+ 740,
+ -160
+ ],
+ "size": {
+ "0": 315,
+ "1": 334
+ },
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 18
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 19
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 20
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 21
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 51
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSamplerAdvanced"
+ },
+ "widgets_values": [
+ "enable",
+ 249141753340176,
+ "randomize",
+ 50,
+ 8,
+ "dpmpp_2m_sde_gpu",
+ "normal",
+ 0,
+ 40,
+ "enable"
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1434,
+ 342
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 53
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 12
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 24
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 21,
+ "type": "PreviewImage",
+ "pos": [
+ 1797,
+ 344
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 24
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 34,
+ "type": "Reroute",
+ "pos": [
+ 1132,
+ 0
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 51
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "LATENT",
+ "links": [
+ 52,
+ 53
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 18,
+ "type": "PrimitiveNode",
+ "pos": [
+ -228,
+ -23
+ ],
+ "size": {
+ "0": 210,
+ "1": 76
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 17,
+ 38
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text_g"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "a beautiful landscape with trees and a mountain in the background"
+ ],
+ "color": "#432",
+ "bgcolor": "#653"
+ },
+ {
+ "id": 5,
+ "type": "EmptyLatentImage",
+ "pos": [
+ 330,
+ 90
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 21
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 1024,
+ 512,
+ 8
+ ]
+ },
+ {
+ "id": 20,
+ "type": "PrimitiveNode",
+ "pos": [
+ -229,
+ 135
+ ],
+ "size": {
+ "0": 210,
+ "1": 76
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 23,
+ 39
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text_g"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "text, logo, borders, frame"
+ ],
+ "color": "#432",
+ "bgcolor": "#653"
+ },
+ {
+ "id": 32,
+ "type": "SaveImage",
+ "pos": [
+ 2555,
+ -157
+ ],
+ "size": {
+ "0": 315,
+ "1": 270
+ },
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 43
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "sdxl_w_ref"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 11,
+ 16,
+ 1,
+ 14,
+ 0,
+ "CLIP"
+ ],
+ [
+ 12,
+ 16,
+ 2,
+ 8,
+ 1,
+ "VAE"
+ ],
+ [
+ 15,
+ 16,
+ 1,
+ 17,
+ 0,
+ "CLIP"
+ ],
+ [
+ 17,
+ 18,
+ 0,
+ 14,
+ 1,
+ ],
+ [
+ 18,
+ 16,
+ 0,
+ 19,
+ 0,
+ ],
+ [
+ 19,
+ 14,
+ 0,
+ 19,
+ 1,
+ ],
+ [
+ 20,
+ 17,
+ 0,
+ 19,
+ 2,
+ ],
+ [
+ 21,
+ 5,
+ 0,
+ 19,
+ 3,
+ ],
+ [
+ 23,
+ 20,
+ 0,
+ 17,
+ 1,
+ ],
+ [
+ 24,
+ 8,
+ 0,
+ 21,
+ 0,
+ ],
+ [
+ 25,
+ 28,
+ 0,
+ 24,
+ 0,
+ ],
+ [
+ 35,
+ 22,
+ 2,
+ 24,
+ 1,
+ "VAE"
+ ],
+ [
+ 37,
+ 22,
+ 1,
+ 30,
+ 0,
+ "CLIP"
+ ],
+ [
+ 38,
+ 18,
+ 0,
+ 30,
+ 1,
+ ],
+ [
+ 39,
+ 20,
+ 0,
+ 31,
+ 1,
+ ],
+ [
+ 40,
+ 22,
+ 1,
+ 31,
+ 0,
+ "CLIP"
+ ],
+ [
+ 41,
+ 31,
+ 0,
+ 28,
+ 2,
+ ],
+ [
+ 42,
+ 30,
+ 0,
+ 28,
+ 1,
+ ],
+ [
+ 43,
+ 24,
+ 0,
+ 32,
+ 0,
+ ],
+ [
+ 44,
+ 22,
+ 0,
+ 28,
+ 0,
+ ],
+ [
+ 51,
+ 19,
+ 0,
+ 34,
+ 0,
+ "*"
+ ],
+ [
+ 52,
+ 34,
+ 0,
+ 28,
+ 3,
+ ],
+ [
+ 53,
+ 34,
+ 0,
+ 8,
+ 0,
+ ]
+ ],
+ "groups": [
+ {
+ "title": "SDXL Base",
+ "bounding": [
+ 90,
+ -274,
+ 1041,
+ 479
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Group",
+ "bounding": [
+ 1219,
+ -266,
+ 1299,
+ 544
+ ],
+ "color": "#8A8",
+ "font_size": 24,
+ "locked": false
+ }
+ ],
+ "config": {},
+ "extra": {},
+ "version": 0.4
\ No newline at end of file
diff --git a/ComfyUI/workflows/text_to_skybox.json b/ComfyUI/workflows/text_to_skybox.json
new file mode 100644
index 0000000..ecd438d
--- /dev/null
+++ b/ComfyUI/workflows/text_to_skybox.json
@@ -0,0 +1,3518 @@
+ "last_node_id": 139,
+ "last_link_id": 233,
+ "nodes": [
+ {
+ "id": 44,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 4514.93927980685,
+ 896.9013427646663
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 82
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 80,
+ 85
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 77
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_refiner_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 53,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 5118.939279806862,
+ 903.9013427646663
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 85
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 84,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 86
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "a text, a logo, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 72,
+ "type": "PreviewImage",
+ "pos": [
+ 6732.320592261197,
+ 1022.4875354181333
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 55,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 116
+ }
+ ],
+ "title": "Refined Inpainted Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 69,
+ "type": "PreviewImage",
+ "pos": [
+ 6728.320592261197,
+ 680.4875354181331
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 52,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 114
+ }
+ ],
+ "title": "Inpainted Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 64,
+ "type": "PreviewImage",
+ "pos": [
+ 6726.320592261197,
+ 348.48753541813363
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 48,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 110
+ }
+ ],
+ "title": "Masked Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 51,
+ "type": "CLIPTextEncodeSDXLRefiner",
+ "pos": [
+ 5115.939279806862,
+ 850.9013427646667
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 80
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "link": 81,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 83
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXLRefiner"
+ },
+ "widgets_values": [
+ 6,
+ 1024,
+ 1024,
+ "a landscape of a calm lake during winter"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 47,
+ "type": "PrimitiveNode",
+ "pos": [
+ -2704.9724782326266,
+ -1546.9441847257217
+ ],
+ "size": {
+ "0": 210,
+ "1": 76.00001525878906
+ },
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 84,
+ 98
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "a text, a logo, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 103,
+ "type": "VAEEncodeForInpaint",
+ "pos": [
+ 276.26822492539986,
+ -1508.7930392200958
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 158
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 210
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 159
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 160
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncodeForInpaint"
+ },
+ "widgets_values": [
+ 50
+ ]
+ },
+ {
+ "id": 79,
+ "type": "VAEDecode",
+ "pos": [
+ -868.7971534193995,
+ -2118.442204567087
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 127
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 204
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 133,
+ 164
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 105,
+ "type": "VAEDecode",
+ "pos": [
+ 1533.0353624058816,
+ -1296.2245560660915
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 167
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 211
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 169,
+ 200
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 102,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -322.31398221552763,
+ -1265.5950233140627
+ ],
+ "size": {
+ "0": 400,
+ "1": 200
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 229
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 155
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "text, logo, clouds"
+ ]
+ },
+ {
+ "id": 123,
+ "type": "VAEEncodeForInpaint",
+ "pos": [
+ 2305.68769282187,
+ -862.2867925875006
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 36,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 188
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 213
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 189
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 192
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncodeForInpaint"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 52,
+ "type": "KSampler",
+ "pos": [
+ 5436.58734015978,
+ 682.9881587332845
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 50,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 82
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 83
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 86
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 111
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 88
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 551062170539516,
+ "randomize",
+ 30,
+ 8,
+ "euler",
+ "normal",
+ 0.2
+ ]
+ },
+ {
+ "id": 76,
+ "type": "PrimitiveNode",
+ "pos": [
+ -2700.127957755949,
+ -1275.3608583919813
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 120,
+ 122,
+ 131
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "title": "Initial Image Height\n",
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ 512,
+ "fixed"
+ ]
+ },
+ {
+ "id": 75,
+ "type": "PrimitiveNode",
+ "pos": [
+ -2704.127957755949,
+ -1408.3608583919818
+ ],
+ "size": {
+ "0": 210,
+ "1": 82
+ },
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 119,
+ 121,
+ 130
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "width"
+ }
+ }
+ ],
+ "title": "Initial Image Width\n",
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ 2048,
+ "fixed"
+ ]
+ },
+ {
+ "id": 82,
+ "type": "EmptyLatentImage",
+ "pos": [
+ -1873.7971534193998,
+ -2014.442204567088
+ ],
+ "size": {
+ "0": 315,
+ "1": 106
+ },
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 130,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 131,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 132
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 2
+ ]
+ },
+ {
+ "id": 56,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ -1883,
+ -1834
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 95
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 97,
+ "widget": {
+ "name": "text_g"
+ }
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 119,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 120,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 96,
+ 124
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 0,
+ 0,
+ 2048,
+ 512,
+ "a landscape of a calm lake during winter",
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 57,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ -1878,
+ -1781
+ ],
+ "size": {
+ "0": 400,
+ "1": 270.0000305175781
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 99
+ },
+ {
+ "name": "text_g",
+ "type": "STRING",
+ "link": 98,
+ "widget": {
+ "name": "text_g"
+ }
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "link": 121,
+ "widget": {
+ "name": "width"
+ }
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "link": 122,
+ "widget": {
+ "name": "height"
+ }
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 100,
+ 125
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 2048,
+ 512,
+ 0,
+ 0,
+ 2048,
+ 512,
+ "a text, a logo, borders",
+ "a logo, text, borders"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 78,
+ "type": "KSampler",
+ "pos": [
+ -1389.7971534193982,
+ -2047.442204567088
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 123
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 124
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 125
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 132
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 127
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 1035767414431814,
+ "randomize",
+ 20,
+ 8,
+ "euler",
+ "normal",
+ 0.8
+ ]
+ },
+ {
+ "id": 71,
+ "type": "PreviewImage",
+ "pos": [
+ 6690,
+ -860
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 133
+ }
+ ],
+ "title": "Initial Image",
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 106,
+ "type": "PreviewImage",
+ "pos": [
+ 6700,
+ -490
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 169
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 126,
+ "type": "PreviewImage",
+ "pos": [
+ 6710,
+ -160
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 39,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 194
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 129,
+ "type": "Reroute",
+ "pos": [
+ 1582,
+ -411
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 212
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 213,
+ 214,
+ 215
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 49,
+ "type": "VAEDecode",
+ "pos": [
+ 5836,
+ 931
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 53,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 88
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 77
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 116,
+ 117
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 130,
+ "type": "Reroute",
+ "pos": [
+ 3917,
+ 502
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 215
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 216,
+ 217
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 128,
+ "type": "Reroute",
+ "pos": [
+ -117,
+ -985
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 209
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 210,
+ 211,
+ 212
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 127,
+ "type": "Reroute",
+ "pos": [
+ -1378,
+ -1240
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 203
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 204,
+ 209
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 34,
+ "type": "UNETLoader",
+ "pos": [
+ -2756,
+ -941
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 220
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader"
+ },
+ "widgets_values": [
+ "sd_xl_inpainting_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 133,
+ "type": "Reroute",
+ "pos": [
+ 757,
+ -873
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 220
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "MODEL",
+ "links": [
+ 221,
+ 222,
+ 223
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 121,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 2060,
+ -679
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 225
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 190
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 0,
+ 0,
+ 1024,
+ 1024,
+ "ground viewed from above",
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 122,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ 2040,
+ -334
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 226
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 191
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 0,
+ 0,
+ 1024,
+ 1024,
+ "",
+ ""
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 134,
+ "type": "Reroute",
+ "pos": [
+ 1573,
+ -535
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 227
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "CLIP",
+ "links": [
+ 225,
+ 226
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 25,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -2753.9724782326266,
+ -1126.9441847257215
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 123
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 95,
+ 99,
+ 227,
+ 228
+ ],
+ "shape": 3,
+ "slot_index": 1
+ },
+ {
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 203
+ ],
+ "shape": 3,
+ "slot_index": 2
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple"
+ },
+ "widgets_values": [
+ "sd_xl_base_1.0.safetensors"
+ ]
+ },
+ {
+ "id": 97,
+ "type": "CLIPTextEncodeSDXL",
+ "pos": [
+ -309,
+ -1387
+ ],
+ "size": {
+ "0": 400,
+ "1": 270
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "clip",
+ "type": "CLIP",
+ "link": 230
+ }
+ ],
+ "outputs": [
+ {
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 154
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncodeSDXL"
+ },
+ "widgets_values": [
+ 2048,
+ 2048,
+ 0,
+ 0,
+ 2048,
+ 2048,
+ "the sky viewed from below",
+ ""
+ ]
+ },
+ {
+ "id": 135,
+ "type": "Reroute",
+ "pos": [
+ -665,
+ -1176
+ ],
+ "size": [
+ 75,
+ 26
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 228
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "CLIP",
+ "links": [
+ 229,
+ 230
+ ],
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false
+ }
+ },
+ {
+ "id": 88,
+ "type": "ImageStretchForOutpaint",
+ "pos": [
+ -233,
+ -1688
+ ],
+ "size": {
+ "0": 315,
+ "1": 126
+ },
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 164
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 142,
+ 158
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 151,
+ 159
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageStretchForOutpaint"
+ },
+ "widgets_values": [
+ 1024,
+ 0,
+ 10
+ ]
+ },
+ {
+ "id": 95,
+ "type": "MaskToImage",
+ "pos": [
+ 337,
+ -1710
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 151,
+ "slot_index": 0
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 150
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage"
+ }
+ },
+ {
+ "id": 96,
+ "type": "PreviewImage",
+ "pos": [
+ 760,
+ -1747
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 150
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 68,
+ "type": "SaveImage",
+ "pos": [
+ 6668,
+ 1410
+ ],
+ "size": {
+ "0": 315,
+ "1": 270.0000305175781
+ },
+ "flags": {},
+ "order": 54,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 113
+ }
+ ],
+ "title": "Inpainted Image",
+ "properties": {},
+ "widgets_values": [
+ "skybox"
+ ]
+ },
+ {
+ "id": 73,
+ "type": "MiddleSplit",
+ "pos": [
+ 6094,
+ 1014
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 56,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 117
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 118,
+ 231
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 125,
+ "type": "VAEDecode",
+ "pos": [
+ 3080,
+ -412
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 38,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 193
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 214
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 194,
+ 201,
+ 202
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 24,
+ "type": "VerticalMiddleMask",
+ "pos": [
+ 3360,
+ 298
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {},
+ "order": 40,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 201
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Mask",
+ "type": "MASK",
+ "links": [
+ 106,
+ 107
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VerticalMiddleMask"
+ },
+ "widgets_values": [
+ 168
+ ]
+ },
+ {
+ "id": 21,
+ "type": "MiddleSplit",
+ "pos": [
+ 3756,
+ 137
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 41,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 202
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 42,
+ 49,
+ 109
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 63,
+ "type": "MaskToImage",
+ "pos": [
+ 3740,
+ 267
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 43,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 107
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 108
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage"
+ }
+ },
+ {
+ "id": 42,
+ "type": "workflow/Blurry Mask",
+ "pos": [
+ 3786,
+ 386
+ ],
+ "size": {
+ "0": 315,
+ "1": 318
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 42,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 106
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 67
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "workflow/Blurry Mask"
+ },
+ "widgets_values": [
+ 10,
+ 1,
+ "red"
+ ]
+ },
+ {
+ "id": 61,
+ "type": "ImageBlend",
+ "pos": [
+ 4114,
+ 233
+ ],
+ "size": {
+ "0": 315,
+ "1": 102
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 46,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image1",
+ "type": "IMAGE",
+ "link": 109
+ },
+ {
+ "name": "image2",
+ "type": "IMAGE",
+ "link": 108
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 110
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageBlend"
+ },
+ "widgets_values": [
+ 0.5,
+ "multiply"
+ ]
+ },
+ {
+ "id": 27,
+ "type": "VAEEncodeForInpaint",
+ "pos": [
+ 4411,
+ 133
+ ],
+ "size": {
+ "0": 315,
+ "1": 98
+ },
+ "flags": {},
+ "order": 45,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 49
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 216
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": 67
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 55
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncodeForInpaint"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 32,
+ "type": "KSampler",
+ "pos": [
+ 4784,
+ 201
+ ],
+ "size": {
+ "0": 315,
+ "1": 262
+ },
+ "flags": {},
+ "order": 47,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 223
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 96
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 100
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 55
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 56,
+ 111
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 346909251795691,
+ "randomize",
+ 50,
+ 8,
+ "euler",
+ "normal",
+ 0.9
+ ]
+ },
+ {
+ "id": 28,
+ "type": "VAEDecode",
+ "pos": [
+ 5170,
+ 467
+ ],
+ "size": {
+ "0": 210,
+ "1": 46
+ },
+ "flags": {},
+ "order": 49,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "samples",
+ "type": "LATENT",
+ "link": 56
+ },
+ {
+ "name": "vae",
+ "type": "VAE",
+ "link": 217
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 112,
+ 114
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 67,
+ "type": "MiddleSplit",
+ "pos": [
+ 5456,
+ 422
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 51,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 112
+ }
+ ],
+ "outputs": [
+ {
+ "name": "Image",
+ "type": "IMAGE",
+ "links": [
+ 113
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MiddleSplit"
+ }
+ },
+ {
+ "id": 136,
+ "type": "ImageReverseLambert",
+ "pos": [
+ 6334,
+ 1069
+ ],
+ "size": {
+ "0": 226.8000030517578,
+ "1": 26
+ },
+ "flags": {},
+ "order": 58,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 231
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 232
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageReverseLambert"
+ }
+ },
+ {
+ "id": 137,
+ "type": "SaveImage",
+ "pos": [
+ 6664,
+ 2058
+ ],
+ "size": {
+ "0": 315,
+ "1": 270
+ },
+ "flags": {},
+ "order": 59,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 232
+ }
+ ],
+ "properties": {},
+ "widgets_values": [
+ "skybox_refined_lambert"
+ ]
+ },
+ {
+ "id": 83,
+ "type": "PreviewImage",
+ "pos": [
+ 332,
+ -1329
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 142
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 120,
+ "type": "ImageStretchForOutpaint",
+ "pos": [
+ 1914.6118936249993,
+ -902.1584717031254
+ ],
+ "size": {
+ "0": 315,
+ "1": 126
+ },
+ "flags": {},
+ "order": 34,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": 200
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 188,
+ 233
+ ],
+ "shape": 3,
+ "slot_index": 0
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 189
+ ],
+ "shape": 3,
+ "slot_index": 1
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageStretchForOutpaint"
+ },
+ "widgets_values": [
+ 0,
+ 512,
+ 30
+ ]
+ },
+ {
+ "id": 138,
+ "type": "PreviewImage",
+ "pos": [
+ 2642,
+ -528
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {},
+ "order": 35,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 233
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 3,
+ "type": "PreviewImage",
+ "pos": [
+ 4291,
+ 286
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": false
+ },
+ "order": 44,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 42
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ }
+ },
+ {
+ "id": 46,
+ "type": "PrimitiveNode",
+ "pos": [
+ -2709.8029387054517,
+ -1675.4843737657613
+ ],
+ "size": {
+ "0": 210,
+ "1": 76.00000762939453
+ },
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "outputs": [
+ {
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 81,
+ 97
+ ],
+ "slot_index": 0,
+ "widget": {
+ "name": "text"
+ }
+ }
+ ],
+ "properties": {
+ "Run widget replace on values": false
+ },
+ "widgets_values": [
+ "a landscape of a calm lake during winter"
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 100,
+ "type": "KSamplerAdvanced",
+ "pos": [
+ 1005.2682249254053,
+ -1472.7930392200956
+ ],
+ "size": {
+ "0": 315,
+ "1": 334
+ },
+ "flags": {},
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 221
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 154
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 155
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 160
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 167
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSamplerAdvanced"
+ },
+ "widgets_values": [
+ "enable",
+ 35120899948498,
+ "randomize",
+ 30,
+ 8,
+ "euler",
+ "normal",
+ 0,
+ 10000,
+ "enable"
+ ]
+ },
+ {
+ "id": 124,
+ "type": "KSamplerAdvanced",
+ "pos": [
+ 2696,
+ -952
+ ],
+ "size": {
+ "0": 315,
+ "1": 334
+ },
+ "flags": {},
+ "order": 37,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "model",
+ "type": "MODEL",
+ "link": 222
+ },
+ {
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 190
+ },
+ {
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 191
+ },
+ {
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 192
+ }
+ ],
+ "outputs": [
+ {
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 193
+ ],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSamplerAdvanced"
+ },
+ "widgets_values": [
+ "enable",
+ 357250320487448,
+ "randomize",
+ 30,
+ 8,
+ "euler",
+ "normal",
+ 0,
+ 10000,
+ "disable"
+ ]
+ },
+ {
+ "id": 50,
+ "type": "SaveImage",
+ "pos": [
+ 6677,
+ 1729
+ ],
+ "size": {
+ "0": 315,
+ "1": 270.00006103515625
+ },
+ "flags": {},
+ "order": 57,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": 118
+ }
+ ],
+ "title": "Inpainted Refined Image",
+ "properties": {},
+ "widgets_values": [
+ "skybox_refined"
+ ]
+ }
+ ],
+ "links": [
+ [
+ 42,
+ 21,
+ 0,
+ 3,
+ 0,
+ ],
+ [
+ 49,
+ 21,
+ 0,
+ 27,
+ 0,
+ ],
+ [
+ 55,
+ 27,
+ 0,
+ 32,
+ 3,
+ ],
+ [
+ 56,
+ 32,
+ 0,
+ 28,
+ 0,
+ ],
+ [
+ 67,
+ 42,
+ 0,
+ 27,
+ 2,
+ "MASK"
+ ],
+ [
+ 77,
+ 44,
+ 2,
+ 49,
+ 1,
+ "VAE"
+ ],
+ [
+ 80,
+ 44,
+ 1,
+ 51,
+ 0,
+ "CLIP"
+ ],
+ [
+ 81,
+ 46,
+ 0,
+ 51,
+ 1,
+ ],
+ [
+ 82,
+ 44,
+ 0,
+ 52,
+ 0,
+ ],
+ [
+ 83,
+ 51,
+ 0,
+ 52,
+ 1,
+ ],
+ [
+ 84,
+ 47,
+ 0,
+ 53,
+ 1,
+ ],
+ [
+ 85,
+ 44,
+ 1,
+ 53,
+ 0,
+ "CLIP"
+ ],
+ [
+ 86,
+ 53,
+ 0,
+ 52,
+ 2,
+ ],
+ [
+ 88,
+ 52,
+ 0,
+ 49,
+ 0,
+ ],
+ [
+ 95,
+ 25,
+ 1,
+ 56,
+ 0,
+ "CLIP"
+ ],
+ [
+ 96,
+ 56,
+ 0,
+ 32,
+ 1,
+ ],
+ [
+ 97,
+ 46,
+ 0,
+ 56,
+ 1,
+ ],
+ [
+ 98,
+ 47,
+ 0,
+ 57,
+ 1,
+ ],
+ [
+ 99,
+ 25,
+ 1,
+ 57,
+ 0,
+ "CLIP"
+ ],
+ [
+ 100,
+ 57,
+ 0,
+ 32,
+ 2,
+ ],
+ [
+ 106,
+ 24,
+ 0,
+ 42,
+ 0,
+ "MASK"
+ ],
+ [
+ 107,
+ 24,
+ 0,
+ 63,
+ 0,
+ "MASK"
+ ],
+ [
+ 108,
+ 63,
+ 0,
+ 61,
+ 1,
+ ],
+ [
+ 109,
+ 21,
+ 0,
+ 61,
+ 0,
+ ],
+ [
+ 110,
+ 61,
+ 0,
+ 64,
+ 0,
+ ],
+ [
+ 111,
+ 32,
+ 0,
+ 52,
+ 3,
+ ],
+ [
+ 112,
+ 28,
+ 0,
+ 67,
+ 0,
+ ],
+ [
+ 113,
+ 67,
+ 0,
+ 68,
+ 0,
+ ],
+ [
+ 114,
+ 28,
+ 0,
+ 69,
+ 0,
+ ],
+ [
+ 116,
+ 49,
+ 0,
+ 72,
+ 0,
+ ],
+ [
+ 117,
+ 49,
+ 0,
+ 73,
+ 0,
+ ],
+ [
+ 118,
+ 73,
+ 0,
+ 50,
+ 0,
+ ],
+ [
+ 119,
+ 75,
+ 0,
+ 56,
+ 2,
+ "INT"
+ ],
+ [
+ 120,
+ 76,
+ 0,
+ 56,
+ 3,
+ "INT"
+ ],
+ [
+ 121,
+ 75,
+ 0,
+ 57,
+ 2,
+ "INT"
+ ],
+ [
+ 122,
+ 76,
+ 0,
+ 57,
+ 3,
+ "INT"
+ ],
+ [
+ 123,
+ 25,
+ 0,
+ 78,
+ 0,
+ ],
+ [
+ 124,
+ 56,
+ 0,
+ 78,
+ 1,
+ ],
+ [
+ 125,
+ 57,
+ 0,
+ 78,
+ 2,
+ ],
+ [
+ 127,
+ 78,
+ 0,
+ 79,
+ 0,
+ ],
+ [
+ 130,
+ 75,
+ 0,
+ 82,
+ 0,
+ "INT"
+ ],
+ [
+ 131,
+ 76,
+ 0,
+ 82,
+ 1,
+ "INT"
+ ],
+ [
+ 132,
+ 82,
+ 0,
+ 78,
+ 3,
+ ],
+ [
+ 133,
+ 79,
+ 0,
+ 71,
+ 0,
+ ],
+ [
+ 142,
+ 88,
+ 0,
+ 83,
+ 0,
+ ],
+ [
+ 150,
+ 95,
+ 0,
+ 96,
+ 0,
+ ],
+ [
+ 151,
+ 88,
+ 1,
+ 95,
+ 0,
+ "MASK"
+ ],
+ [
+ 154,
+ 97,
+ 0,
+ 100,
+ 1,
+ ],
+ [
+ 155,
+ 102,
+ 0,
+ 100,
+ 2,
+ ],
+ [
+ 158,
+ 88,
+ 0,
+ 103,
+ 0,
+ ],
+ [
+ 159,
+ 88,
+ 1,
+ 103,
+ 2,
+ "MASK"
+ ],
+ [
+ 160,
+ 103,
+ 0,
+ 100,
+ 3,
+ ],
+ [
+ 164,
+ 79,
+ 0,
+ 88,
+ 0,
+ ],
+ [
+ 167,
+ 100,
+ 0,
+ 105,
+ 0,
+ ],
+ [
+ 169,
+ 105,
+ 0,
+ 106,
+ 0,
+ ],
+ [
+ 188,
+ 120,
+ 0,
+ 123,
+ 0,
+ ],
+ [
+ 189,
+ 120,
+ 1,
+ 123,
+ 2,
+ "MASK"
+ ],
+ [
+ 190,
+ 121,
+ 0,
+ 124,
+ 1,
+ ],
+ [
+ 191,
+ 122,
+ 0,
+ 124,
+ 2,
+ ],
+ [
+ 192,
+ 123,
+ 0,
+ 124,
+ 3,
+ ],
+ [
+ 193,
+ 124,
+ 0,
+ 125,
+ 0,
+ ],
+ [
+ 194,
+ 125,
+ 0,
+ 126,
+ 0,
+ ],
+ [
+ 200,
+ 105,
+ 0,
+ 120,
+ 0,
+ ],
+ [
+ 201,
+ 125,
+ 0,
+ 24,
+ 0,
+ ],
+ [
+ 202,
+ 125,
+ 0,
+ 21,
+ 0,
+ ],
+ [
+ 203,
+ 25,
+ 2,
+ 127,
+ 0,
+ "*"
+ ],
+ [
+ 204,
+ 127,
+ 0,
+ 79,
+ 1,
+ "VAE"
+ ],
+ [
+ 209,
+ 127,
+ 0,
+ 128,
+ 0,
+ "*"
+ ],
+ [
+ 210,
+ 128,
+ 0,
+ 103,
+ 1,
+ "VAE"
+ ],
+ [
+ 211,
+ 128,
+ 0,
+ 105,
+ 1,
+ "VAE"
+ ],
+ [
+ 212,
+ 128,
+ 0,
+ 129,
+ 0,
+ "*"
+ ],
+ [
+ 213,
+ 129,
+ 0,
+ 123,
+ 1,
+ "VAE"
+ ],
+ [
+ 214,
+ 129,
+ 0,
+ 125,
+ 1,
+ "VAE"
+ ],
+ [
+ 215,
+ 129,
+ 0,
+ 130,
+ 0,
+ "*"
+ ],
+ [
+ 216,
+ 130,
+ 0,
+ 27,
+ 1,
+ "VAE"
+ ],
+ [
+ 217,
+ 130,
+ 0,
+ 28,
+ 1,
+ "VAE"
+ ],
+ [
+ 220,
+ 34,
+ 0,
+ 133,
+ 0,
+ "*"
+ ],
+ [
+ 221,
+ 133,
+ 0,
+ 100,
+ 0,
+ ],
+ [
+ 222,
+ 133,
+ 0,
+ 124,
+ 0,
+ ],
+ [
+ 223,
+ 133,
+ 0,
+ 32,
+ 0,
+ ],
+ [
+ 225,
+ 134,
+ 0,
+ 121,
+ 0,
+ "CLIP"
+ ],
+ [
+ 226,
+ 134,
+ 0,
+ 122,
+ 0,
+ "CLIP"
+ ],
+ [
+ 227,
+ 25,
+ 1,
+ 134,
+ 0,
+ "*"
+ ],
+ [
+ 228,
+ 25,
+ 1,
+ 135,
+ 0,
+ "*"
+ ],
+ [
+ 229,
+ 135,
+ 0,
+ 102,
+ 0,
+ "CLIP"
+ ],
+ [
+ 230,
+ 135,
+ 0,
+ 97,
+ 0,
+ "CLIP"
+ ],
+ [
+ 231,
+ 73,
+ 0,
+ 136,
+ 0,
+ ],
+ [
+ 232,
+ 136,
+ 0,
+ 137,
+ 0,
+ ],
+ [
+ 233,
+ 120,
+ 0,
+ 138,
+ 0,
+ ]
+ ],
+ "groups": [
+ {
+ "title": "Horizontal Tiling",
+ "bounding": [
+ 2846,
+ 60,
+ 2843,
+ 494
+ ],
+ "color": "#b58b2a",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "User Inputs",
+ "bounding": [
+ -2764,
+ -1749,
+ 335,
+ 731
+ ],
+ "color": "#88A",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Refining",
+ "bounding": [
+ 4504,
+ 610,
+ 2062,
+ 534
+ ],
+ "color": "#8A8",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Output",
+ "bounding": [
+ 6657,
+ 1324,
+ 335,
+ 1038
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Previews",
+ "bounding": [
+ 6717,
+ 274,
+ 236,
+ 1004
+ ],
+ "color": "#444",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Sky creation",
+ "bounding": [
+ -383,
+ -1902,
+ 2136,
+ 874
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Initial Image",
+ "bounding": [
+ -1918,
+ -2186,
+ 1342,
+ 443
+ ],
+ "color": "#8AA",
+ "font_size": 24,
+ "locked": false
+ },
+ {
+ "title": "Ground Outpainting",
+ "bounding": [
+ 1892,
+ -1026,
+ 1128,
+ 994
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "locked": false
+ }
+ ],
+ "config": {},
+ "extra": {
+ "groupNodes": {
+ "Blurry Mask": {
+ "nodes": [
+ {
+ "type": "MaskToImage",
+ "pos": [
+ 190,
+ 520
+ ],
+ "size": {
+ "0": 210,
+ "1": 26
+ },
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage"
+ },
+ "index": 0
+ },
+ {
+ "type": "ImageBlur",
+ "pos": [
+ 320,
+ 650
+ ],
+ "size": {
+ "0": 315,
+ "1": 82
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageBlur"
+ },
+ "widgets_values": [
+ 20,
+ 1
+ ],
+ "index": 1
+ },
+ {
+ "type": "PreviewImage",
+ "pos": [
+ 500,
+ 640
+ ],
+ "size": {
+ "0": 210,
+ "1": 246
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "images",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewImage"
+ },
+ "index": 2
+ },
+ {
+ "type": "ImageToMask",
+ "pos": [
+ 380,
+ 540
+ ],
+ "size": {
+ "0": 315,
+ "1": 58
+ },
+ "flags": {
+ "collapsed": true
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": [],
+ "shape": 3,
+ "slot_index": 0
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageToMask"
+ },
+ "widgets_values": [
+ "red"
+ ],
+ "index": 3
+ }
+ ],
+ "links": [
+ [
+ null,
+ 0,
+ 0,
+ 0,
+ 24,
+ "MASK"
+ ],
+ [
+ 0,
+ 0,
+ 1,
+ 0,
+ 39,
+ ],
+ [
+ 1,
+ 0,
+ 2,
+ 0,
+ 38,
+ ],
+ [
+ 1,
+ 0,
+ 3,
+ 0,
+ 38,
+ ]
+ ],
+ "external": [
+ [
+ 3,
+ 0,
+ "MASK"
+ ]
+ ]
+ }
+ }
+ },
+ "version": 0.4
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0a27482
--- /dev/null
+++ b/README.md
@@ -0,0 +1,179 @@
+# VR Environment Creation with Generative AI, Python Server
+A Python project to create VR environments using Generative AI.
+You can run it as a TCP server to interface it with a [Unity client](https://github.com/fcbg-hnp-vr/VR-Environment-GenAI-Unity),
+to get the fully-fledged AI/VR application.
+This is a use case of generative AI to build a complete VR scenery.
+It was developed at the [Fondation Campus Biotech Geneva](https://fcbg.ch/),
+in collaboration with the [Laboratory of Cognitive Neuroscience](https://www.epfl.ch/labs/lnco/) at EPFL.
+## Requirements
+- Python 3.10.12+
+- A CUDA-compatible graphics card with at least 12 GB of VRAM.
+- Up to 15 GB of storage for the models.
+## Installation
+Using Python:
+1. Install [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive), it enables computation on the GPU.
+2. Install **Python 3.10**, for Windows you can download it using the [official installer](https://www.python.org/downloads/windows/).
+3. Clone or copy the Git repository into a folder named `VR-Environment-GenAI-Server`.
+4. Create a Python virtual environment. While not strictly necessary, it is highly recommended as the project requires
+many dependencies. For instance using [venv](https://docs.python.org/3/library/venv.html):
+ * On Linux:
+ ```bash
+ cd VR-Environment-GenAI-Server
+ # From https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-and-use-virtual-environments
+ python -m venv .venv # Creates the virtual environment under .venv
+ source .venv/bin/activate # Activates it
+ ```
+ * On Windows:
+ ```shell
+ cd VR-Environment-GenAI-Server
+ # From https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#create-and-use-virtual-environments
+ py -m venv .venv # Creates the virtual environment under .venv
+ .venv\Scripts\activate # Activates it
+ ```
+5. Install the Python requirements.
+ ```bash
+ pip install -r requirements.txt
+ ```
+ **Important**: at the time of writing (2024-07-29) the default version of PyTorch
+ is compatible with CUDA 12.1, and you may not need any extra steps. If you receive
+ an error message telling you that your version of PyTorch is not compatible with CUDA,
+ uninstall PyTorch completely and reinstall it by running
+ ``pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121``.
+ Please have a look at the [PyTorch installation guide](https://pytorch.org/get-started/locally/) for details.
+From here on, the project should be functional. The next section is optional, but it can save you a lot of time.
+> (optional) You can speed up image generation using [accelerate](https://huggingface.co/docs/accelerate/index).
+Download it with ``pip install accelerate``.
+## Installation details
+* The first time a model is launched, it needs to be downloaded.
+This operation can take some time and requires an internet connection.
+The [Usage](#usage) section explains how to download all models at once.
+* For users of PyCharm, an `.idea` folder is included to add the folder as a project.
+* Optional, demo only: to capture the audio from the microphone in Python (ASR),
+you need ffmpeg, portaudio and pyaudio:
+ ```bash
+ sudo apt install ffmpeg portaudio19-dev python3-pyaudio
+ pip install -r requirements-optional.txt # Installs PyAudio
+ ```
+## Usage
+Each file can be executed independently, so there are as many entry points as files.
+The most common use cases are the following:
+* Generate a new image with ``python -m skybox.diffusion``.
+* Download all models with ``python -m utils.download_models``.
+If you skip this step, the models will be downloaded at run time, which may be very slow.
+* Start the server with ``python -m server.run``.
+Next is the detail for special files.
+### Image generation
+Go to the ``skybox`` folder.
+1. diffusion.py - base module to create an image from a diffusion model.
+2. inpainting.py - implements an inpainting model.
+3. image_processing.py - defines image processing features
+4. mask_editor.py - logic to generate a mask adapted to the image.
+The result is usually passed to the inpainting functions.
+5. panorama_creator.py - logic to generate a panorama.
+6. The code in ``skybox/legacy`` may not be useful. It is kept there for the author's personal reference.
+### 3D features
+3D features are in the ``environment`` folder. It is still in active development at the time of writing (June 2024),
+hence the following is subject to change.
+1. depth_generation.py - provides a model that creates a depth map from a standard RGB image.
+2. point_cloud_pipeline.py - uses the RGBD to create a point cloud, and converts it to a mesh.
+3. mesh_pipeline.py - uses the RGBD image and representation features to create a terrain mesh.
+4. mask_former.py - semantic segmentation of an RGB image.
+5. image_segmentation.py - uses an RGBD+semantic image to isolate the main elements.
+6. depth_inpainting.py - combines inpainting with depth-based control to recreate parts of a terrain.
+Not yet integrated into the main code base.
+7. renderer.py - creates a 3D view of the terrain; not finished yet.
+### Speech-to-text (ASR)
+For speech to text features, go to ``asr`` (automatic speech recognition)
+* speech_to_text.py - implements an Automatic Speech Recognition (ASR) model.
+* asr_demo.py - a simple demo; you can either use your microphone or load a sample from a dataset.
+### ComfyUI graphical interface
+If you want to use a graphical interface instead of Python code,
+you can use the provided [ComfyUI](https://github.com/comfyanonymous/ComfyUI) workflows
+in the `ComfyUI` folder.
+The explanation for each workflow is detailed in [ComfyUI/README.md](ComfyUI/README.md).
+### Server
+The server features are in `server`. See [Start as a TCP server](#start-as-a-tcp-server) for the details on usage.
+* run.py - starts a TCP server, able to serve requests to the previously defined models.
+* task_tracker.py - a class adding syntactic sugar to track a task easily.
+* utils.py - Utility functions for the server.
+### Other Features
+* As a test, the ``sound`` folder has some experiments with sound generation.
+* The ``utils`` folder contains useful functions for the user:
+ * download_models.py - downloads useful models for the server. It does not download all models.
+## Configuration
+The main server configuration is in ``api.json``.
+The most significant configuration data are "serverIp" and "serverPort" as they set the address of the server.
+## Start as a TCP server
+A TCP server can be started in order to offload the AI part from the application thread.
+Just launch `python -m server.run`. The server [configuration](#configuration) is defined in `api.json`.
+The communication is handled in JSON format, with a strong HTTP style.
+To connect to the server from another computer on the same network, you need to open a port.
+On Windows, you simply need to go to the control panel and add a new rule for the port `9000` (with the default configuration).
+This [How-To Geek tutorial](https://www.howtogeek.com/394735/how-do-i-open-a-port-on-windows-firewall/) explains the procedure step by step.
+On Linux, opening ports is a bit more involved; I personally recommend using nginx with a port redirection.
+## Roadmap
+Current status of the project, from a very high-level perspective.
+- [x] Skybox generation : v0.4 done, go to ``skybox/panorama_creator.py``
+- [ ] Terrain generation : Early 3D terrain generation in ``environment/renderer.py`` not suitable for production now.
+- [ ] Props generation : use billboards only, as current technology does not allow us to dream bigger.
+## Models' list
+This project includes several artificial neural network models.
+If you want to substitute one model for another, you should have a good knowledge of what you are doing,
+otherwise the quality of the end product may be decreased.
+- Image creation : [Stable Diffusion XL base 1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and
+[Stable Diffusion XL refiner 1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0).
+- Inpainting and outpainting : [Stable Diffusion XL 1.0 Inpainting 0.1](https://huggingface.co/diffusers/stable-diffusion-xl-1.0-inpainting-0.1).
+- Speech-to-text and translation : [Whisper Large v3](https://huggingface.co/openai/whisper-large-v3).
+Please have a look at ``utils/download_models.py`` to see where those models are loaded from.
+## Useful Links
+You can download the official Unity client from [VR-Environment-GenAI-Unity (GitHub)](https://github.com/fcbg-hnp-vr/VR-Environment-GenAI-Unity).
diff --git a/api.json b/api.json
new file mode 100644
index 0000000..0221a69
--- /dev/null
+++ b/api.json
@@ -0,0 +1,7 @@
+ "name": "AIWorldGenerationAPI",
+ "description": "AI World Generation API",
+ "version": "0.4.2",
+ "serverIp": "",
+ "serverPort": 9000
\ No newline at end of file
diff --git a/asr/asr_demo.py b/asr/asr_demo.py
new file mode 100644
index 0000000..6b30647
--- /dev/null
+++ b/asr/asr_demo.py
@@ -0,0 +1,79 @@
+Demo file for an Automatic Speech Recognition system.
+import wave
+from datasets import load_dataset
+import pyaudio
+from asr.speech_to_text import do_audio_transcription
+def register_audio():
+ """Register audio from the user's microphone."""
+ chunk = 1024
+ audio_format = pyaudio.paInt16
+ channels = 1
+ rate = 44100
+ record_seconds = 10
+ output_filename = "output.mp3"
+ p = pyaudio.PyAudio()
+ stream = p.open(
+ format=audio_format,
+ channels=channels,
+ rate=rate,
+ input=True,
+ frames_per_buffer=chunk,
+ )
+ print("* recording")
+ frames = []
+ for _ in range(int(rate / chunk * record_seconds)):
+ data = stream.read(chunk)
+ frames.append(data)
+ print("* done recording")
+ stream.stop_stream()
+ stream.close()
+ p.terminate()
+ wf = wave.open(output_filename, "wb")
+ wf.setnchannels(channels)
+ wf.setsampwidth(p.get_sample_size(audio_format))
+ wf.setframerate(rate)
+ wf.writeframes(b"".join(frames))
+ wf.close()
+ return output_filename
+def main_demo():
+ """
+ Print the user audio or a default sample.
+ If the user chooses to enter their own audio, it calls the `register_audio` function to record
+ audio and then uses the `sample_to_text` function to convert the audio to text.
+ If the user chooses not to enter their own audio,
+ it uses a default sample from the 'distil-whisper/librispeech_long' dataset.
+ """
+ if input("Would you like to enter your own audio (y/[N])? ") == "y":
+ print("Please describe what you would like to see.")
+ sample = register_audio()
+ else:
+ print("Using default sample")
+ dataset = load_dataset(
+ "distil-whisper/librispeech_long", "clean", split="validation"
+ )
+ sample = dataset[0]["audio"]
+ result = do_audio_transcription(sample)
+ print(result["text"])
+if __name__ == "__main__":
+ main_demo()
diff --git a/asr/speech_to_text.py b/asr/speech_to_text.py
new file mode 100644
index 0000000..6ed2e8e
--- /dev/null
+++ b/asr/speech_to_text.py
@@ -0,0 +1,51 @@
+A simple Speech-to-Text module.
+It uses whisper by OpenAI, source https://huggingface.co/openai/whisper-large-v3
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+MODEL_ID = "openai/whisper-large-v3"
+def get_asr_model():
+ """Load the model from Hugging Face."""
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ return AutoModelForSpeechSeq2Seq.from_pretrained(
+ torch_dtype=torch_dtype,
+ low_cpu_mem_usage=True,
+ use_safetensors=True
+ )
+def do_audio_transcription(audio):
+ """
+ Return the text from an audio file.
+ :param audio: Input audio, either a file path or bytes
+ :type audio: str | bytes[]
+ :return str: Text in the audio
+ """
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model = get_asr_model().to(device)
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
+ pipe = pipeline(
+ "automatic-speech-recognition",
+ model=model,
+ tokenizer=processor.tokenizer,
+ feature_extractor=processor.feature_extractor,
+ max_new_tokens=128,
+ chunk_length_s=30,
+ batch_size=16,
+ return_timestamps=True,
+ torch_dtype=torch_dtype,
+ device=device,
+ generate_kwargs={"task": "translate"}
+ )
+ return pipe(audio)
diff --git a/environment/depth_generation.py b/environment/depth_generation.py
new file mode 100644
index 0000000..a4597df
--- /dev/null
+++ b/environment/depth_generation.py
@@ -0,0 +1,151 @@
+Generate an RGBD image from a single image.
+from PIL import Image
+from diffusers import MarigoldDepthPipeline
+import matplotlib.pyplot as plt
+import numpy as np
+def compute_image_depth(image, color_map="Spectral"):
+ """
+ Compute the depth of the image.
+ :param PIL.Image.Image image: Input RGB image path.
+ :param str | None color_map: Colorize depth image, set to None to skip colormap generation.
+ :return: Pipeline
+ """
+ # Original DDIM version (higher quality)
+ pipe = MarigoldDepthPipeline.from_pretrained(
+ "prs-eth/marigold-v1-0",
+ custom_pipeline="marigold_depth_estimation",
+ )
+ # Note: a 16-bit variant is also available, just use torch_dtype=torch.float16, variant="fp16"
+ pipe.to("cuda")
+ return pipe(
+ image,
+ # (optional) Maximum resolution of processing. If set to 0: will not resize at all.
+ # Defaults to 768.
+ # processing_res=768,
+ # (optional) Resize depth prediction to match input resolution.
+ # match_input_res=True,
+ # (optional) Inference batch size, no bigger than `num_ensemble`.
+ # If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
+ # batch_size=0,
+ # (optional) Random seed can be set to ensure additional reproducibility.
+ # Default: None (unseeded).
+ # Note: forcing --batch_size 1 helps to increase reproducibility.
+ # To ensure full reproducibility, deterministic mode needs to be used.
+ # seed=2024,
+ # (optional) Colormap used to colorize the depth map. Defaults to "Spectral".
+ # Set to `None` to skip colormap generation.
+ color_map=color_map,
+ # (optional) If true, will show progress bars of the inference progress.
+ show_progress_bar=False,
+ )
+def get_depth(image):
+ """Return a depth map of the image."""
+ pipeline_output = compute_image_depth(image, color_map=None)
+ return pipeline_output.depth_np
+def get_depth_image(image, depth_map_path=None, color_map_path=None):
+ """
+ Return the colored depth image, save both grey and colored depth.
+ :param PIL.Image.Image image: Input RGB image path.
+ :param depth_map_path: Path to the depth map if it should be saved as a file
+ :type depth_map_path: str or None
+ :param color_map_path: Path to the colored depth map if it should be saved as a file
+ :type color_map_path: str or None
+ :return np.ndarray: Depth map, between 0 and 1
+ """
+ pipeline_output = compute_image_depth(image)
+ # Predicted depth map
+ depth = pipeline_output.depth_np
+ if depth_map_path is not None:
+ # Save as uint16 PNG
+ depth_uint16 = (depth * (2**16 - 1)).astype(np.uint16)
+ grey_depth_image = Image.fromarray(depth_uint16)
+ grey_depth_image.save(depth_map_path, mode="I;16")
+ if color_map_path is not None:
+ # Colorized prediction
+ depth_colored: Image.Image = pipeline_output.depth_colored
+ # Save colorized depth map
+ depth_colored.save(color_map_path)
+ return depth
+def plot_arrays(array1, array2, titles=None):
+ """
+ Plot two matrix arrays as images.
+ Create a figure with two subplots and plots the given matrix arrays as grayscale images.
+ If the `titles` parameter is provided, it sets the titles for the two plots.
+ :param array1: The first matrix array to be plotted.
+ :type array1: numpy.ndarray
+ :param array2: The second matrix array to be plotted.
+ :type array2: numpy.ndarray
+ :param titles: Optional titles for the two plotted images.
+ :type titles: tuple, default is None
+ """
+ # Create a figure and grid objects
+ _fig, axes = plt.subplots(1, 2)
+ # Plot the arrays as an images
+ axes[0].imshow(array1, cmap="gray")
+ axes[1].imshow(array2, cmap="gray")
+ if titles is not None:
+ axes[0].set_title(titles[0])
+ axes[1].set_title(titles[1])
+ plt.show()
+def view_flat_estimation(rgbd_image):
+ """
+ Plot the color and depth components of an RGBD image.
+ :param rgbd_image: An RGBD Image containing the color and depth components of an image.
+ :type rgbd_image: open3d.geometry.RGBDImage
+ """
+ plot_arrays(
+ rgbd_image.color,
+ rgbd_image.depth,
+ ["Mountain grayscale image", "Mountain depth image"],
+ )
+def get_horizon_height(depth_map):
+ """Return the height of the horizon line in pixel coordinates."""
+ average_depth = np.median(depth_map, axis=1)
+ return np.argmax(average_depth)
+def main(image=None):
+ """
+ Main demo function for depth generation.
+ :param PIL.Image.Image | None image: The image to generate depth from.
+ """
+ depth_map_path = "outputs/" + ("sunny_" if image is None else "") + "depth_map.png"
+ color_map_path = (
+ "outputs/" + ("sunny_" if image is None else "") + "depth_colored.png"
+ )
+ if image is None:
+ image = Image.open("../sunny_mountain.png")
+ get_depth_image(image, depth_map_path, color_map_path)
+if __name__ == "__main__":
+ main(Image.open("../sky.png"))
diff --git a/environment/depth_inpainting.py b/environment/depth_inpainting.py
new file mode 100644
index 0000000..4cad811
--- /dev/null
+++ b/environment/depth_inpainting.py
@@ -0,0 +1,70 @@
+Inpainting using depth data as a ControlNet.
+import diffusers
+from PIL import Image
+import torch
def get_inpaint_depth_pipeline():
    """
    Initialize and return a Stable Diffusion XL ControlNet inpainting pipeline.

    The pipeline uses depth data as a control signal for inpainting.
    Both the ControlNet and the base model are loaded in half precision.
    For details:
    https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet_sdxl#diffusers.StableDiffusionXLControlNetInpaintPipeline

    :return: A pre-configured pipeline for inpainting with depth control.
    :rtype: diffusers.StableDiffusionXLControlNetInpaintPipeline
    """
    controlnet = diffusers.ControlNetModel.from_pretrained(
        "diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16
    )
    return diffusers.StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        controlnet=controlnet,
        torch_dtype=torch.float16,
    )
def inpaint_depth_controlled(init_image, mask_image, control_image, prompts):
    """
    Perform depth-guided inpainting using a Stable Diffusion XL ControlNet pipeline.

    A fresh pipeline is built on every call, switched to model CPU offloading,
    and run for 50 inference steps.

    :param PIL.Image.Image init_image: The initial image to start inpainting from.
    :param PIL.Image.Image mask_image: The mask image indicating the areas to be inpainted.
    :param PIL.Image.Image control_image: The depth map image to guide the inpainting process.
    :param str prompts: The text prompt to guide the image generation.
    :return list[PIL.Image.Image]: A list containing the generated inpainted images.
    """
    pipeline = get_inpaint_depth_pipeline()
    # Offloading is used instead of moving the whole pipeline to CUDA,
    # which avoids running out of VRAM.
    pipeline.enable_model_cpu_offload()
    generation = pipeline(
        prompts,
        num_inference_steps=50,
        eta=1.0,
        image=init_image,
        mask_image=mask_image,
        control_image=control_image,
    )
    return generation.images
if __name__ == "__main__":
    # Inpaint the sunny mountain picture and show the first generated image.
    demo_results = inpaint_depth_controlled(
        Image.open("../sunny_mountain.png"),
        Image.open("../skybox/mask.png"),
        Image.open("sunny_depth_map.png"),
        "a mountain",
    )
    demo_results[0].show()
diff --git a/environment/image_segmentation.py b/environment/image_segmentation.py
new file mode 100644
index 0000000..df9a8a0
--- /dev/null
+++ b/environment/image_segmentation.py
@@ -0,0 +1,610 @@
+Image segmentation techniques using the depth map.
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from scipy.ndimage import gaussian_filter
+import skimage
+from sklearn.preprocessing import minmax_scale, scale
+from sklearn.cluster import KMeans, DBSCAN
+import torch
+from environment.mask_former import mask_former, panoptic_segmentation, get_sky_ids
+DEFAULT_IMAGE = "../sunny_mountain.png"
def planar_grid(image):
    """
    Create a planar grid of normalized pixel coordinates.

    The output grid has shape ``(*image.shape, image.ndim)`` and satisfies
    ``grid[x, y] = [x / (image.shape[0] - 1), y / (image.shape[1] - 1)]``,
    so every coordinate lies in ``[0, 1]``.

    :param numpy.ndarray image: Array whose shape defines the grid.
    :return numpy.ndarray: The normalized coordinates of every element.
    """
    # Put the coordinate axis last so the grid lines up with the image axes
    # (a plain transpose would also swap the image axes themselves).
    grid = np.moveaxis(np.indices(image.shape), 0, -1)
    # An axis of length 1 has no extent: divide by 1 so its coordinate stays 0
    extent = np.maximum(np.asarray(image.shape) - 1, 1)
    return grid / extent
def do_kmeans(data, n_clusters):
    """
    Apply a K-means clustering to every pixel of a stacked image.

    :param numpy.ndarray data: Array indexed as (row, column, feature).
    :param int n_clusters: Number of clusters to build.
    :return numpy.ndarray: Cluster label of each pixel, shaped like the two first axes.
    """
    n_rows, n_cols, n_features = data.shape[0], data.shape[1], data.shape[2]
    # One sample per pixel, one column per feature
    samples = data.reshape((-1, n_features))
    model = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED)
    model.fit(samples)
    pixel_labels = model.predict(samples)
    return pixel_labels.reshape(n_rows, n_cols)
def deep_scale_stack_data(*data):
    """
    Stack arrays along the depth axis after min-max scaling each of them.

    Each array is scaled as a whole (flattened to a single column), not per channel.
    """
    scaled_layers = []
    for layer in data:
        scaled_column = minmax_scale(layer.reshape(-1, 1))
        scaled_layers.append(scaled_column.reshape(*layer.shape))
    return np.dstack(tuple(scaled_layers))
def depth_clustering(image, n_clusters=5):
    """
    Cluster using depth only.

    :param numpy.ndarray image: Depth map, either (rows, cols) or (rows, cols, features).
    :param int n_clusters: Number of clusters to build.
    :return numpy.ndarray: Cluster label of each pixel.
    """
    # do_kmeans reads the feature count from the third axis, so a plain 2-D
    # depth map needs a trailing singleton axis; 3-D inputs are left unchanged.
    return do_kmeans(np.atleast_3d(image), n_clusters)
def spatial_clustering(image, n_clusters=15):
    """
    Cluster using XYD data: the depth value together with the pixel position.

    :param numpy.ndarray image: Depth map.
    :param int n_clusters: Number of clusters to build.
    """
    coordinates = planar_grid(image)
    features = deep_scale_stack_data(image, coordinates)
    return do_kmeans(features, n_clusters)
def rgbd_spatial_clustering(depth_image, rgb_image, n_clusters=15):
    """
    Cluster using spatial K-means on RGBD data.

    The features are, in order: pixel position, color, depth.
    """
    coordinates = planar_grid(depth_image)
    features = deep_scale_stack_data(coordinates, rgb_image, depth_image)
    return do_kmeans(features, n_clusters)
def mask_former_clustering(depth_image, mask_former_labels, n_clusters=15):
    """
    Cluster using the MaskFormer labels as an extra feature.

    The features are, in order: pixel position, depth, MaskFormer label.
    """
    coordinates = planar_grid(depth_image)
    features = deep_scale_stack_data(coordinates, depth_image, mask_former_labels)
    return do_kmeans(features, n_clusters)
def spatial_fz_mf_clustering(image, segments_fz, mask_former_labels, n_clusters=None):
    """
    Cluster on position, depth, Felzenszwalb segments and MaskFormer labels.

    :param numpy.ndarray image: Depth map.
    :param numpy.ndarray segments_fz: Labels from Felzenszwalb's method.
    :param numpy.ndarray mask_former_labels: Labels from MaskFormer.
    :param int | None n_clusters: Number of clusters; when None, the larger of the
        two label counts is used.
    """
    coordinates = planar_grid(image)
    if n_clusters is None:
        n_fz = np.unique(segments_fz).shape[0]
        n_mask_former = np.unique(mask_former_labels).shape[0]
        n_clusters = max((n_fz, n_mask_former))
        print(f"Segmenting in {n_clusters} clusters")
    features = deep_scale_stack_data(
        coordinates, image, segments_fz, mask_former_labels
    )
    return do_kmeans(features, n_clusters)
def compare_segmentations(image_path, depth_path):
    """
    Plot several segmentation strategies side by side for one image.

    :param str image_path: Path of the RGB image given to MaskFormer.
    :param str depth_path: Path of the depth map, read as a grayscale image.
    """
    depth = skimage.io.imread(depth_path, as_gray=True)
    # Remove background: depths beyond the 0.7 quantile are clipped to it
    clipped = np.clip(depth, 0, np.quantile(depth, 0.7))
    # Gaussian filtering to reduce noise before the spatial K-means
    smoothed_clipped = skimage.filters.gaussian(clipped, sigma=1)
    map_shape = smoothed_clipped.shape

    plain_spatial = spatial_clustering(skimage.filters.gaussian(depth, sigma=1), 12)
    clipped_spatial = spatial_clustering(smoothed_clipped, 12).reshape(map_shape)
    mask_former_labels = mask_former(Image.open(image_path))
    k_mask_former = mask_former_clustering(clipped, mask_former_labels, 10).reshape(
        map_shape
    )
    min_segment_size = int(np.sqrt(depth.shape[0] * depth.shape[1]) * 10)
    segments_fz = skimage.segmentation.felzenszwalb(
        clipped, scale=1, min_size=min_segment_size
    )
    fz_mf_labels = spatial_fz_mf_clustering(
        clipped, segments_fz, mask_former_labels, n_clusters=None
    )

    _fig, ax = plt.subplots(3, 3, figsize=(10, 10), sharex=True, sharey=True)
    outline = skimage.segmentation.mark_boundaries
    # Panels in row-major order: two rows of outlines, one row of raw labels
    panels = (
        ("Spatial K-means", outline(depth, plain_spatial.reshape(map_shape))),
        ("Clipped spatial K-means", outline(depth, clipped_spatial)),
        ("Felzenszwalbs's method", outline(depth, segments_fz)),
        ("MaskFormer segmentation", outline(depth, mask_former_labels)),
        ("K-Mean + MaskFormer segmentation", outline(depth, k_mask_former)),
        ("MaskFormer + Felzenszwalbs,\nK-mean", outline(depth, fz_mf_labels)),
        ("Spatial K-means", clipped_spatial),
        ("K-Mean + MaskFormer segmentation", k_mask_former),
        ("MaskFormer + Felzenszwalbs,\nK-mean", fz_mf_labels),
    )
    for axis, (title, picture) in zip(ax.flat, panels):
        axis.set_title(title)
        axis.imshow(picture)
    plt.show()
def segmentation_maps(image_path, depth_path):
    """
    Segment the image and display the intermediate and final label maps.

    :param str image_path: Path of the RGB image given to MaskFormer.
    :param str depth_path: Path of the depth map, read as a grayscale image.
    """
    depth = skimage.io.imread(depth_path, as_gray=True)
    # Remove background: depths beyond the 0.7 quantile are clipped to it
    clipped = np.clip(depth, 0, np.quantile(depth, 0.7))
    # The xyD picture is built from the unclipped depth
    xyd_image = deep_scale_stack_data(depth, planar_grid(depth))
    mask_former_labels = mask_former(Image.open(image_path))
    min_segment_size = int(np.sqrt(depth.shape[0] * depth.shape[1]) * 10)
    segments_fz = skimage.segmentation.felzenszwalb(
        clipped, scale=1, min_size=min_segment_size
    )
    fz_mf_labels = spatial_fz_mf_clustering(
        clipped, segments_fz, mask_former_labels, n_clusters=None
    )

    _fig, ax = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)
    panels = (
        ("Clipped xyD image", xyd_image),
        ("Felzenszwalbs's method", segments_fz),
        ("MaskFormer segmentation", mask_former_labels),
        ("Final segmentation", fz_mf_labels),
    )
    for axis, (title, picture) in zip(ax.flat, panels):
        axis.set_title(title)
        axis.imshow(picture)
    plt.show()
def segment_image(image_path, depth_path, far_clip_pos=0.7):
    """
    Segment the RGB image by combining depth, Felzenszwalb and MaskFormer labels.

    :param str image_path: Path of the RGB image given to MaskFormer.
    :param str depth_path: Path of the depth map, read as a grayscale image.
    :param float far_clip_pos: Quantile of the depth beyond which values are clipped.
    :return numpy.ndarray: Cluster label of each pixel.
    """
    depth = skimage.io.imread(depth_path, as_gray=True)
    # Remove background
    clipped = np.clip(depth, 0, np.quantile(depth, far_clip_pos))
    mask_former_labels = mask_former(Image.open(image_path))
    min_segment_size = int(np.sqrt(depth.shape[0] * depth.shape[1]) * 10)
    segments_fz = skimage.segmentation.felzenszwalb(
        clipped, scale=1, min_size=min_segment_size
    )
    return spatial_fz_mf_clustering(clipped, segments_fz, mask_former_labels)
def show_images_grid(np_images):
    """
    Show images in a roughly square grid.

    :param np_images: Sequence of images accepted by ``imshow``.
        Nothing is displayed when the sequence is empty.
    """
    n_images = len(np_images)
    if n_images == 0:
        # No column count can be derived from an empty sequence
        return
    n_cols = int(np.sqrt(n_images))
    n_lines = int(np.ceil(n_images / n_cols))
    # squeeze=False keeps a 2-D array of axes even for a single row or column,
    # which happens for fewer than four images.
    _fig, ax = plt.subplots(n_lines, n_cols, squeeze=False)
    # ax.flat walks the grid row by row, i.e. image i lands at (i // n_cols, i % n_cols)
    for axis, result in zip(ax.flat, np_images):
        axis.imshow(result)
    plt.show()
def split_image(image, labels):
    """
    Split an image into one masked copy per label.

    Every copy keeps the pixels of its label and zeroes the others; it is also
    saved as ``outputs/image_<label>.png``.

    :param numpy.ndarray image: Image of shape (rows, cols) or (rows, cols, channels).
    :param numpy.ndarray labels: Label of each pixel, shape (rows, cols).
    :return list[numpy.ndarray]: The masked images, ordered by increasing label value.
    """
    outputs = []
    for target_label in np.unique(labels):
        mask = (labels == target_label).astype(image.dtype)
        if image.ndim > mask.ndim:
            # Broadcast over the channel axis whatever the channel count (RGB, RGBA...)
            mask = mask[..., np.newaxis]
        image_masked = image * mask
        outputs.append(image_masked)
        Image.fromarray(image_masked).save(f"outputs/image_{target_label}.png")
    return outputs
def crop_to_mask(np_image, mask):
    """
    Crop an image to the bounding box of a mask.

    :param numpy.ndarray np_image: Image to crop; its two first axes match the mask.
    :param numpy.ndarray mask: 2-D mask; non-zero entries mark the area to keep.
    :return numpy.ndarray: The part of the image inside the bounding box, borders included.
    :raises ValueError: If the mask has no non-zero entry.
    """
    # Indices of the non-zero mask entries
    rows, cols = np.nonzero(mask)
    if rows.size == 0:
        raise ValueError("Cannot crop to an empty mask.")
    # Slice ends are exclusive, hence the + 1 to keep the last masked row and column
    return np_image[rows.min() : rows.max() + 1, cols.min() : cols.max() + 1]
def crop_to_content(np_images):
    """
    Crop each image to its non-empty area.

    A pixel counts as content when the sum of its channels is positive.
    Each crop is also saved, with the content mask as alpha channel, as
    ``outputs/cropped_<index>.png``.

    :param np_images: Iterable of images indexed as (row, column, channel).
    :return list[numpy.ndarray]: The cropped images, without alpha channel.
    """
    crops = []
    for index, picture in enumerate(np_images):
        content = np.sum(picture, axis=2) > 0
        cropped_picture = crop_to_mask(picture, content)
        crops.append(cropped_picture)
        alpha = crop_to_mask(content, content) * 255
        with_alpha = np.dstack((cropped_picture, alpha)).astype(np.uint8)
        Image.fromarray(with_alpha).save(f"outputs/cropped_{index}.png")
    return crops
def segment_and_save(image_path, depth_path):
    """
    Segment an image and save each segment.

    :param str image_path: Path of the RGB image.
    :param str depth_path: Path of the depth map.
    :return list[numpy.ndarray]: One cropped image per segment.
    """
    pixel_labels = segment_image(image_path, depth_path)
    rgb_image = skimage.io.imread(image_path)
    per_label_images = split_image(rgb_image, pixel_labels)
    return crop_to_content(per_label_images)
def is_segment_skybox(
    depth_field, mask, far_threshold, near_threshold, vertical_gradient=0.2
):
    """
    Check if a given segment is a good skybox candidate.

    :param numpy.ndarray depth_field: Depth values around the segment.
    :param numpy.ndarray mask: Boolean mask of the segment, shaped like ``depth_field``.
    :param float far_threshold: A mean depth above this value is always a skybox.
    :param float near_threshold: A mean depth below this value is never a skybox.
    :param float vertical_gradient: Upper bound of the mean row-to-row variation
        accepted for segments between the two thresholds.
    :return: Whether the segment is considered a part of the skybox.
    """
    mean_depth = np.mean(depth_field[mask])
    if mean_depth > far_threshold:
        # Object is very far: skybox
        return True
    if mean_depth < near_threshold:
        # Object very near: not a skybox
        return False
    # In between: look at how the normalized masked depth varies from one row to the next
    normalized = depth_field * mask / mean_depth
    row_variation = normalized[1:] - normalized[:-1]
    return np.mean(row_variation) < vertical_gradient
def mask_skybox(image_path, depth_path, labeled_image):
    """
    List the labels corresponding to the sky.

    :param str image_path: Path of the RGB image.
    :param str depth_path: Path of the depth map.
    :param numpy.ndarray labeled_image: Label of each pixel.
    :return list: The labels whose segment is a skybox candidate.
    """
    rgb_image = skimage.io.imread(image_path)
    depth_map = skimage.io.imread(depth_path)
    # The returned images are not used here; the call is kept because
    # split_image also writes one file per label.
    split_image(rgb_image, labeled_image)
    far_plane = np.quantile(depth_map, 0.55)
    near_plane = np.quantile(depth_map, 0.30)
    skybox_list = []
    for label_id in np.unique(labeled_image):
        segment = labeled_image == label_id
        segment_depth = crop_to_mask(depth_map, segment)
        segment_mask = crop_to_mask(segment, segment)
        if is_segment_skybox(segment_depth, segment_mask, far_plane, near_plane):
            skybox_list.append(label_id)
    return skybox_list
def mask_terrain(image_path, depth_path, labeled_image, ignore_labels=None):
    """
    List the labels belonging to the terrain.

    A label is kept when the mean row-to-row variation of its depth, measured
    inside its mask, is below 0.002. Debug figures are shown for every label.

    :param str image_path: Path of the RGB image.
    :param str depth_path: Path of the depth map; it is rescaled to [0, 1].
    :param numpy.ndarray labeled_image: Label of each pixel.
    :param ignore_labels: Labels to skip (e.g. the skybox ones); None skips nothing.
    :return list: The labels considered as terrain.
    """
    # The default None must behave as "ignore nothing" for the membership test below
    ignored = () if ignore_labels is None else ignore_labels
    rgb_image = skimage.io.imread(image_path)
    depth_map = minmax_scale(skimage.io.imread(depth_path).flatten()).reshape(
        rgb_image.shape[:-1]
    )
    labels = np.unique(labeled_image)
    images = split_image(rgb_image, labeled_image)
    terrain_list = []
    # Check if elements belong to the terrain
    for label_id, _im in zip(labels, images):
        if label_id in ignored:
            continue
        mask = labeled_image == label_id
        cropped_mask = crop_to_mask(mask, mask)
        depth_field = crop_to_mask(depth_map, mask)
        masked_image = depth_field * cropped_mask
        row_variation = masked_image[1:] - masked_image[:-1]
        plt.imshow(depth_field)
        plt.show()
        plt.imshow(masked_image)
        plt.show()
        plt.imshow(row_variation)
        plt.show()
        # Only the variations located inside the mask contribute to the slope
        slope = np.mean(row_variation[cropped_mask[1:]])
        print(label_id, "has slope", slope)
        if slope < 0.002:
            # Mark the label as terrain
            print(label_id, "is terrain")
            terrain_list.append(label_id)
    return terrain_list
def segment_parts():
    """
    Segment the default image into skybox and terrain parts.

    The terrain labels are printed; nothing is returned.
    """
    depth_path = "../outputs/depth_map.png"
    facebook_mask_former_labels = mask_former(Image.open(DEFAULT_IMAGE))
    skybox_indices = mask_skybox(
        DEFAULT_IMAGE, depth_path, facebook_mask_former_labels
    )
    # mask_terrain expects the image path first, like mask_skybox;
    # the skybox labels are passed as the labels to ignore.
    terrain_indices = mask_terrain(
        DEFAULT_IMAGE,
        depth_path,
        facebook_mask_former_labels,
        skybox_indices,
    )
    print("terrain indices are ", terrain_indices)
def segment_skybox(segmentation, depth_map):
    """
    Probability for each segment to be a skybox part.

    The score of a segment is the mean, over the whole image, of a per-row weight
    (height prior multiplied by the normalized mean depth of the row) restricted
    to the pixels of the segment.

    :param dict segmentation: The segmentation data; the ``"segmentation"`` and
        ``"segments_info"`` entries are read.
    :param np.ndarray depth_map: The depth of each pixel.
    :return torch.Tensor: Tensor of probability for each segment.
    """
    # Depth made monotonous starting from the top row
    reduced_depth = force_monotonous(depth_map, bottom_to_top=False)
    # One mean depth per image row, rescaled to [0, 1]
    mean_depth = np.mean(reduced_depth, axis=1)
    # NOTE(review): divides by zero when every row has the same mean depth
    norm_mean_depth = (mean_depth - mean_depth.min()) / (
        mean_depth.max() - mean_depth.min()
    )
    # Height prior: 1 on the first row, smoothly decreasing to 0 on the last row
    y_indices = torch.linspace(torch.pi / 2, -torch.pi / 2, depth_map.shape[0])
    height_distribution = (torch.sin(y_indices) + 1) / 2
    segmented = segmentation["segmentation"]
    # The value 0 is treated as "no segment assigned"
    has_undefined = 0 in segmented
    # Masks are stored transposed, (columns, rows), so that their last axis
    # lines up with the per-row weights above.
    # NOTE(review): np.empty does not initialize memory; the rows that are not
    # assigned below keep arbitrary values that end up in the result.
    masks = np.empty(
        (
            len(np.unique(segmented)) + has_undefined + 1,
            depth_map.shape[1],
            depth_map.shape[0],
        )
    )
    if has_undefined:
        masks[0] = segmented.T == 0
    sky_ids = get_sky_ids()
    # NOTE(review): sky_detected is filled but never read afterwards
    sky_detected = []
    for i, info in enumerate(segmentation["segments_info"]):
        masks[i + has_undefined] = segmented.T == (i + has_undefined)
        if info["label_id"] in sky_ids:
            sky_detected.append(i + has_undefined)
    """
    # Just some visualization code (to delete)
    plt.plot(height_distribution, label="Probability following y")
    plt.plot(norm_mean_depth, label="Probability following mean depth")
    plt.plot(height_distribution * norm_mean_depth, label="Combined probability")
    plt.xlabel("Height in pixel coordinates (0 = image top)")
    plt.ylabel("Probability of being above the horizon")
    plt.title("Probability of an horizontal line to be above the horizon (on y coordinate)")
    plt.legend()
    plt.grid()
    plt.show()
    """
    # NOTE(review): the product mixes a torch tensor with numpy arrays;
    # confirm that torch.mean accepts the resulting object.
    sky_probability = torch.mean(
        height_distribution * norm_mean_depth * masks, axis=(1, 2)
    )
    return sky_probability
def get_skybox_mask(segmentation, depth_map, closest_plane=0.3, farthest_plane=0.7):
    """
    Return the skybox mask for a given image.

    :param dict segmentation: Panoptic segmentation from Mask2Former
    :param numpy.ndarray depth_map: Array of depth for each pixel.
    :param float closest_plane: Pixels closer to this plane cannot be a part of the skybox.
    :param float farthest_plane: Pixels above this plane are automatically a part of the skybox.
    :return numpy.ndarray: A binary mask of the same size as the input image.
    """
    sky_probability = segment_skybox(segmentation, depth_map)
    # Use a threshold, lowered if needed so that at least one segment passes
    threshold = min(0.5, torch.max(sky_probability))
    passing_sky = torch.argwhere(sky_probability >= threshold)
    masks = [segmentation["segmentation"] == i for i in passing_sky]
    # Far plane has to be a part of the skybox
    masks.append(torch.from_numpy(depth_map > farthest_plane))
    # Union of any number of masks (torch.logical_or only combines two tensors)
    composite_mask = torch.stack(masks).any(dim=0)
    # Near plane cannot be a part of the skybox
    not_near_plane = torch.from_numpy(depth_map > closest_plane)
    return torch.logical_and(composite_mask, not_near_plane).numpy()
def force_monotonous(data, bottom_to_top=True):
    """
    Make the depth monotonous along the rows with a running maximum.

    :param numpy.ndarray data: Depth values, rows first.
    :param bool bottom_to_top: When True the maximum is accumulated from the last
        row up to the first one, otherwise from the first row down to the last one.
    :return numpy.ndarray: Float array shaped like ``data`` where each row holds the
        maximum of all the rows already visited, itself included.
    """
    values = np.asarray(data)
    if bottom_to_top:
        # Accumulate on the vertically flipped data, then flip back
        running_max = np.maximum.accumulate(values[::-1], axis=0)[::-1]
    else:
        running_max = np.maximum.accumulate(values, axis=0)
    # Every row is written, starting row included; the output is always float
    return running_max.astype(np.float64)
def increasing_depth(monotonous_depth):
    """
    Enhance the monotonous depth map by rewriting all points with positive or null gradient.

    Those points are replaced by the median of the remaining points of their row,
    then the result is made monotonous again. The idea is to get a strictly
    monotonous map that would be similar to the natural view direction.

    :param numpy.ndarray monotonous_depth: Depth of monotonous progression
    :return numpy.ndarray: A natural progression of the depth.
    """
    vertical_gradient = np.gradient(monotonous_depth, axis=0)
    # Discard the points where the depth does not decrease along the rows
    holed_depth = np.copy(monotonous_depth)
    holed_depth[vertical_gradient >= 0] = np.nan
    row_medians = np.nanmedian(holed_depth, axis=1)
    # Fill every hole with the median depth of its own row
    holes = np.isnan(holed_depth)
    row_fill = np.tile(row_medians, (monotonous_depth.shape[1], 1)).T
    holed_depth[holes] = row_fill[holes]
    return force_monotonous(holed_depth)
def get_ground_mask(depth_map, ground_mask=None):
    """
    Identify and segment the ground in a given depth map.

    Candidate pixels are those where the monotonous depth decreases along the rows
    and stays close to its enhanced version. The candidates are smoothed with a
    Gaussian filter, thresholded, and grouped with DBSCAN; the pixels that
    DBSCAN assigns to a cluster form the ground.

    :param numpy.ndarray depth_map: The depth map of the image.
    :param numpy.ndarray | None ground_mask: A mask indicating the initial ground pixels.
    :return numpy.ndarray: A binary mask indicating the ground pixels.
    """
    monotonous_depth = force_monotonous(depth_map)
    vertical_gradient = np.gradient(monotonous_depth, axis=0)
    enhanced_depth = increasing_depth(monotonous_depth)
    adherence = enhanced_depth - monotonous_depth
    close_enough = (adherence * enhanced_depth) < 0.1
    candidates = np.logical_and(close_enough, vertical_gradient < 0).astype(np.float32)
    zones = gaussian_filter(candidates, sigma=10) > 0.5

    # Now select which zones are a part of the ground
    points = np.argwhere(zones)
    labels = DBSCAN(eps=50, min_samples=2000).fit(points).labels_
    # Number of clusters in labels, ignoring noise if present.
    has_noise = -1 in labels
    n_clusters_ = len(set(labels)) - (1 if has_noise else 0)
    n_noise_ = list(labels).count(-1)
    print(f"Estimated number of clusters: {n_clusters_}")
    print(f"Estimated number of noise points: {n_noise_}")

    # -1 marks both the noise points and the pixels outside of the zones
    clustered = -np.ones(depth_map.shape)
    clustered[points[:, 0], points[:, 1]] = labels
    new_ground_mask = clustered >= 0
    if ground_mask is None:
        return new_ground_mask
    return np.logical_and(new_ground_mask, ground_mask)
def segments_objects(depth_map, mask=None):
    """
    Segment the objects in the given depth map using DBSCAN clustering.

    Points are clustered on their pixel position (divided by 1024) and their
    standardized depth.

    :param numpy.ndarray depth_map: The depth map of the image.
    :param numpy.ndarray mask: A mask indicating the points to be considered.
        Defaults to None, which considers every pixel.
    :return numpy.ndarray: The clustered labels for the objects in the depth map;
        -1 for noise and for the pixels outside of the mask.
    """
    if mask is None:
        # Every pixel is a candidate: an index list is needed, like with a mask
        mask = np.ones(depth_map.shape, dtype=bool)
    remaining_indices = np.argwhere(mask)
    clustered = -np.ones(depth_map.shape)
    if remaining_indices.shape[0] == 0:
        # Nothing to cluster
        return clustered
    rows, cols = remaining_indices[:, 0], remaining_indices[:, 1]
    remaining_deep_points = depth_map[rows, cols].reshape(-1, 1)
    points = np.hstack((remaining_indices / 1024, scale(remaining_deep_points)))
    clustering = DBSCAN(eps=0.1, min_samples=2000).fit(points)
    labels = clustering.labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f"Estimated number of object clusters: {n_clusters_}")
    print(f"Estimated number of objects noise points: {n_noise_}")
    # Write back with the integer indices rather than rescaling the float features
    clustered[rows, cols] = labels
    return clustered
def get_horizon_level(depth_map, sky_mask):
    """
    Return the horizon line level.

    For every column, the level is the first row that is neither in the sky mask
    nor at a place where both monotonous depths agree; the mean over the
    columns is returned. The intermediate mask is displayed.

    :param numpy.ndarray depth_map: The depth map of the image.
    :param numpy.ndarray sky_mask: Mask of the sky pixels.
    """
    from_bottom = force_monotonous(depth_map)
    from_top = force_monotonous(depth_map, False)
    horizon_mask = np.logical_or(from_bottom == from_top, sky_mask)
    # argmin on booleans gives the first False of each column
    horizon_line = horizon_mask.argmin(axis=0)
    plt.imshow(horizon_mask)
    plt.show()
    return np.mean(horizon_line)
def segment_anything(image, depth_map):
    """
    Segment the image into skybox, ground, and objects using depth map and panoptic segmentation.

    The ground is searched outside of the sky, and the objects outside of both.

    :param PIL.Image.Image image: The input image.
    :param numpy.ndarray depth_map: The depth map of the image.
    :return: The sky mask, the ground mask and the object cluster labels, in this order.
    :rtype: tuple
    """
    segmentation = panoptic_segmentation(image)[0]
    sky_mask = get_skybox_mask(segmentation, depth_map)
    ground_mask = get_ground_mask(depth_map, np.logical_not(sky_mask))
    neither_sky_nor_ground = np.logical_not(np.logical_or(sky_mask, ground_mask))
    object_clusters = segments_objects(depth_map, neither_sky_nor_ground)
    return sky_mask, ground_mask, object_clusters
def segmentation_demo(image_path, depth_map_path):
    """
    Demonstrate the segmentation process on an image using a depth map.

    The masks are merged in a single map (1 for sky, 2 for ground, 3 and above for
    objects); the ground is shown alone, then the map is drawn over the image.

    :param str image_path: The path to the input image file.
    :param str depth_map_path: The path to the depth map file.
    """
    with open(depth_map_path, "rb") as file:
        depth_map = np.load(file)
    image = Image.open(image_path)
    sky_mask, ground_mask, object_clusters = segment_anything(image, depth_map)
    # Objects labelled -1 are not assigned to any cluster and contribute nothing
    object_part = (object_clusters + 3) * (object_clusters >= 0)
    masks_aggregation = sky_mask + ground_mask * 2 + object_part
    Image.fromarray(masks_aggregation == 2).show()
    plt.imshow(image)
    plt.imshow(masks_aggregation, alpha=0.7)
    plt.show()
if __name__ == "__main__":
    # Different segmentation techniques that can be run instead:
    #   compare_segmentations(DEFAULT_IMAGE, '../outputs/depth_map.png')
    #   segmentation_maps(DEFAULT_IMAGE, '../outputs/depth_map.png')
    #   image_segments = segment_and_save(DEFAULT_IMAGE, '../outputs/depth_map.png')
    #   segment_parts()
    #   prepare_ground_mask("../forest.png", "depth.npy", "mask.npy")
    segmentation_demo(DEFAULT_IMAGE, "depth.npy")
diff --git a/environment/mask_former.py b/environment/mask_former.py
new file mode 100644
index 0000000..f211f5c
--- /dev/null
+++ b/environment/mask_former.py
@@ -0,0 +1,134 @@
+MaskFormer segmentation by Facebook.
+from transformers import Mask2FormerForUniversalSegmentation, Mask2FormerImageProcessor
+from PIL import Image
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import torch
+# model.config.id2label
+ "terrain": (
+ "floor-wood",
+ "flower",
+ "gravel",
+ "river",
+ "road",
+ "sand",
+ "sea",
+ "snow",
+ "stairs",
+ "floor-other-merged",
+ "pavement-merged",
+ "mountain-merged",
+ "grass-merged",
+ "dirt-merged",
+ "building-other-merged",
+ "rock-merged",
+ "rug-merged",
+ ),
+ "sky": ("ceiling-merged", "sky-other-merged"),
def mask2former_model():
    """
    Return the model for Mask2Former.

    The pretrained panoptic checkpoint is loaded again on every call.
    """
    checkpoint = "facebook/mask2former-swin-large-coco-panoptic"
    return Mask2FormerForUniversalSegmentation.from_pretrained(checkpoint)
def merge_label_list():
    """
    Create the id_to_fuse list.

    :return list[list[int]]: For each group of ``LABELS``, in order, the model ids
        of the labels of the group.
    """
    label2id = mask2former_model().config.label2id
    # Iterating LABELS yields its keys: each key gives one list of ids
    return [[label2id[label] for label in LABELS[key]] for key in LABELS]
def panoptic_segmentation(image):
    """
    Apply a panoptic segmentation to a given image.

    Terrain and sky labels are fused: if the sky appears twice, both parts end
    up in the same segment.

    :param PIL.Image.Image image: Image to segment.
    :return: Batch of panoptic segmentations,
        each of which is a dict with keys "segmentation" and "segments_info".
    :rtype: list
    """
    model = mask2former_model()
    processor = Mask2FormerImageProcessor.from_pretrained(
        "facebook/mask2former-swin-large-coco-panoptic"
    )
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    label2id = model.config.label2id
    label_ids_to_fuse = []
    for group in ("terrain", "sky"):
        label_ids_to_fuse.extend(label2id[label] for label in LABELS[group])
    # PIL gives (width, height), the processor expects (height, width)
    target_size = image.size[::-1]
    return processor.post_process_panoptic_segmentation(
        outputs, target_sizes=[target_size], label_ids_to_fuse=label_ids_to_fuse
    )
def mask_former(image):
    """
    Assign a segment id to each pixel of an image.

    :param PIL.Image.Image image: Image to segment.
    :return numpy.ndarray: The panoptic map of the first (and only) batch entry.
    """
    first_result = panoptic_segmentation(image)[0]
    panoptic_map = first_result["segmentation"]
    # Convert the tensor to a NumPy array
    return panoptic_map.squeeze().detach().numpy()
def get_model_labels():
    """Return the id to label dictionary of the model used."""
    return mask2former_model().config.id2label
def get_sky_ids():
    """Return the tuple of model ids of the labels corresponding to the sky."""
    label2id = mask2former_model().config.label2id
    sky_ids = []
    for label in LABELS['sky']:
        sky_ids.append(label2id[label])
    return tuple(sky_ids)
def main(image):
    """
    Demonstration usage of MaskFormer.

    The segments are drawn under the half-transparent image, with a legend
    giving the label of each segment.

    :param PIL.Image.Image image: Image to segment.
    """
    result = panoptic_segmentation(image)[0]
    segments_map = result["segmentation"].squeeze().detach()
    values = torch.unique(segments_map)
    _fig, ax = plt.subplots()
    im = ax.imshow(segments_map)
    ax.imshow(image, alpha=0.5)
    # Colors of the values, according to the colormap used by imshow
    colors = [im.cmap(im.norm(value)) for value in values]
    id2label = get_model_labels()
    labels_list = []
    for seg in result["segments_info"]:
        labels_list.append(id2label[seg["label_id"]] + f" ({seg['label_id']})")
    if 0 in values:
        labels_list.insert(0, "Unknown (0)")
    # One patch (proxy artist) per label, used as legend handles
    patches = []
    for i, label in enumerate(labels_list):
        patches.append(mpl.patches.Patch(color=colors[i], label=label))
    plt.legend(handles=patches)
    plt.show()
if __name__ == "__main__":
    # Run the MaskFormer demo on the island picture.
    island_picture = Image.open("../island.png")
    main(island_picture)
diff --git a/environment/mesh_pipeline.py b/environment/mesh_pipeline.py
new file mode 100644
index 0000000..5374ad3
--- /dev/null
+++ b/environment/mesh_pipeline.py
@@ -0,0 +1,189 @@
+Functions to generate a mesh from an RGBD image.
+import numpy as np
+import open3d as o3d
+import torch
def reduce_image_size(initial_data, resolution):
    """
    Take an input 2D array and average it to reduce its size.

    Each axis is pooled by the integer factor ``size // resolution``, so the
    output has about ``resolution`` elements per axis. An axis already smaller
    than ``resolution`` is left untouched.

    :param numpy.ndarray initial_data: Initial array of shape (W, H)
    :param int resolution: Target resolution to match.
    :return numpy.ndarray: A new array of size close to (resolution, resolution)
    """
    height, width = initial_data.shape[0], initial_data.shape[1]
    # We have more depth pixels than vertices, hence the average.
    # The factor is at least 1: a null kernel size is rejected by the pooling.
    dilate = max(1, height // resolution), max(1, width // resolution)
    averager = torch.nn.AvgPool2d(dilate, stride=dilate)
    data = np.ascontiguousarray(initial_data)
    if not np.issubdtype(data.dtype, np.floating):
        # An average is a float: promote integer inputs before pooling
        data = data.astype(np.float32)
    # Reshape as (N, C, H, W) and pass to torch
    tensor_data = torch.from_numpy(data.reshape(1, 1, height, width))
    return np.asarray(averager(tensor_data)[0, 0])
def create_triangle_list(rows, cols):
    """
    Create the triangle indices that map the vertices of a planar mesh.

    Vertices are numbered row by row. Every cell of the grid is split in two
    triangles: all the upper-left ones come first, then all the lower-right ones.

    :param int rows: Number of rows of vertices.
    :param int cols: Number of columns of vertices.
    :return numpy.ndarray: Triangles indices of shape ((rows - 1) * (cols - 1) * 2, 3)
    """
    # Grid of vertex indices
    indices = np.arange(rows * cols).reshape(rows, cols)
    # Corners of every cell, flattened in row-major order
    upper_left = indices[:-1, :-1].ravel()
    upper_right = indices[:-1, 1:].ravel()
    lower_left = indices[1:, :-1].ravel()
    lower_right = indices[1:, 1:].ravel()
    upper_triangles = np.column_stack((upper_left, lower_left, upper_right))
    lower_triangles = np.column_stack((upper_right, lower_left, lower_right))
    return np.vstack((upper_triangles, lower_triangles))
def create_mesh_geometry(max_resolution, depth_data):
    """
    Create a set of vertices and triangles as a plane with deformation.

    The depth data is first averaged down to about ``max_resolution`` values per
    axis. One vertex is created per remaining depth value: x goes from 0 to 1
    along the columns, y from 1 to 0 along the rows, and z is the averaged depth.
    The vertices are then linked by a regular grid of triangles.

    :param max_resolution: The approximate maximum resolution of the generated mesh.
    :type max_resolution: int
    :param depth_data: The depth data of the input image.
    :type depth_data: numpy.ndarray
    :return: A tuple containing the vertices and triangles of the generated mesh.
    :rtype: tuple
    """
    resized_depth = reduce_image_size(depth_data, max_resolution)
    n_rows, n_cols = np.shape(resized_depth)[0], np.shape(resized_depth)[1]
    # Vertices position on the plane
    x_axis = np.linspace(0, 1, n_cols)
    y_axis = np.linspace(1, 0, n_rows)
    x_grid, y_grid = np.meshgrid(x_axis, y_axis)
    # The depth is used as the third coordinate
    vertices = np.column_stack(
        (x_grid.flatten(), y_grid.flatten(), resized_depth.flatten())
    )
    triangles = create_triangle_list(n_rows, n_cols)
    return vertices, triangles
def generate_uv(triangles, vertices):
    """
    Generate the uv coordinates for a planar mesh.

    One uv pair is produced per triangle corner: the x coordinate of the vertex,
    and its y coordinate with the sign flipped.

    :param triangles: An array of shape (n, 3) for the indices of the vertices in the mesh.
    :type triangles: numpy.ndarray
    :param vertices: An array of shape (m, 3) for the 3D coordinates of the vertices in the mesh.
    :type vertices: numpy.ndarray
    :return: An array of shape (n*3, 2) representing the uv coordinates of the triangle corners.
    :rtype: numpy.ndarray

    Example usage:
    ```python
    triangles = np.array([[0, 1, 2]])
    vertices = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]])
    uv_coords = generate_uv(triangles, vertices)
    print(uv_coords)
    ```
    """
    # Vertex index of every corner, triangle after triangle
    corner_ids = np.asarray(triangles, dtype=np.intp).reshape(-1)
    # A single indexing operation replaces the loop over every corner
    planar = np.asarray(vertices)[corner_ids, :2]
    return (planar * [1, -1]).astype(np.float64)
def mesh_impression_pipeline(depth_map, max_resolution=256, texture_image=None):
    """
    Pipeline to create a mesh from a depth map.

    :param depth_map: The input depth map.
    :type depth_map: numpy.ndarray
    :param max_resolution: Approximate maximum resolution of the generated mesh.
    :type max_resolution: int
    :param texture_image: Path of the texture image file for the mesh;
        it is read with ``open3d.io.read_image``.
    :type texture_image: str or None
    :return: A 3D mesh created from the input depth map.
    :rtype: open3d.geometry.TriangleMesh

    This function creates a 3D mesh from a depth map using the following steps:
    1. Create the vertices and the triangles of a plane deformed by the depth.
    2. Generate the uv coordinates for the mesh.
    3. Load the texture image (if provided) and assign it to the mesh.
    4. Compute the vertex normals for the mesh.

    Example usage:
    ```python
    depth_map = np.asarray(Image.open("depth_map.png")) / 65535
    mesh = mesh_impression_pipeline(depth_map, 256, "texture.png")
    ```
    """
    # Create the mesh
    vertices, triangles = create_mesh_geometry(max_resolution, np.asarray(depth_map))
    mesh = o3d.geometry.TriangleMesh(
        o3d.utility.Vector3dVector(vertices), o3d.utility.Vector3iVector(triangles)
    )
    # One uv pair per triangle corner
    mesh.triangle_uvs = o3d.utility.Vector2dVector(generate_uv(triangles, vertices))
    if texture_image is not None:
        # Load the texture image from its path
        mesh.textures = [o3d.io.read_image(texture_image)]
    mesh.compute_vertex_normals()
    return mesh
diff --git a/environment/point_cloud_pipeline.py b/environment/point_cloud_pipeline.py
new file mode 100644
index 0000000..53d49ea
--- /dev/null
+++ b/environment/point_cloud_pipeline.py
@@ -0,0 +1,201 @@
+RGBD Image to mesh using a point cloud strategy.
+import numpy as np
+import open3d as o3d
+import matplotlib.pyplot as plt
def generate_point_cloud(rgbd_image):
    """
    Project an RGBD image into a point cloud through a fixed pinhole camera.

    :param rgbd_image: An RGBDImage containing the color and depth components of an image.
    :type rgbd_image: open3d.geometry.RGBDImage
    :return: 3D point cloud generated from the input RGBD image.
    :rtype: open3d.geometry.PointCloud

    The camera intrinsic and extrinsic parameters are hard-coded in this function;
    they are not read from the input.
    """
    # Alternative camera: o3d.camera.PinholeCameraIntrinsicParameters.PrimeSenseDefault
    focal_length = 200
    # Arguments: width, height, fx, fy, cx, cy
    camera_intrinsic = o3d.camera.PinholeCameraIntrinsic(
        1, 1, focal_length, focal_length, 0, 0
    )
    # Flips the y and z axes and shifts x and y by 0.5
    camera_extrinsic = [
        [1, 0, 0, 0.5],
        [0, -1, 0, 0.5],
        [0, 0, -1, 0],
        [0, 0, 0, 1],
    ]
    return o3d.geometry.PointCloud.create_from_rgbd_image(
        rgbd_image, intrinsic=camera_intrinsic, extrinsic=camera_extrinsic
    )
def pcd_from_image(rgb_image, depth_map):
    """
    Convert an RGB image and a depth map into a point cloud (pcd).

    :param str rgb_image: Path of the RGB image, read with ``open3d.io.read_image``.
    :param open3d.geometry.Image depth_map: The input depth map.
    :return: The 3D point cloud generated from the input RGB image and depth map.
    :rtype: open3d.geometry.PointCloud

    Every pixel becomes one point: x and y are the pixel position normalised to [0, 1]
    (y pointing up) and z is ``1 - depth``. Normals are estimated afterwards.

    Note: The input RGB image and depth map should have the same dimensions.
    """
    rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth(
        color=o3d.io.read_image(rgb_image),
        depth=depth_map,
        depth_scale=1,
        convert_rgb_to_intensity=False,
    )
    # For debugging: view_flat_estimation(rgbd_image)
    # To use a projected point cloud instead: pcd = generate_point_cloud(rgbd_image)
    depth = np.asarray(rgbd_image.depth)
    height, width = depth.shape[:2]

    # meshgrid returns arrays of shape (len(y), len(x)); x must therefore span the
    # image width and y its height so that the flattened grids line up with the
    # row-major flattened depth map, also for non-square images.
    x_grid, y_grid = np.meshgrid(np.linspace(0, 1, width), np.linspace(1, 0, height))
    points = np.column_stack((x_grid.ravel(), y_grid.ravel(), 1 - depth.ravel()))

    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    colors = np.asarray(rgbd_image.color).reshape(height * width, -1) / 256
    pcd.colors = o3d.utility.Vector3dVector(colors)
    pcd.estimate_normals(
        search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30)
    )
    return pcd
def clustering(pcd):
    """
    Cluster a point cloud with DBSCAN and display the colored result.

    :param pcd: The input point cloud data.
    :type pcd: open3d.geometry.PointCloud
    :return: A copy of the point cloud colored by cluster label.
    :rtype: open3d.geometry.PointCloud

    The reference distance is the inverse square root of the number of points.
    DBSCAN runs with ``eps`` set to 20 times this distance and at least 300 points
    per cluster. Points are colored with the "tab20" color map according to their
    label, noise points (negative label) are black. The colored copy is shown in an
    Open3D window before being returned.

    Example usage:
    ```python
    pcd = generate_point_cloud(rgbd_image)
    clustered_pcd = clustering(pcd)
    ```
    """
    colored_pcd = o3d.geometry.PointCloud(pcd)
    reference_distance = len(pcd.points) ** -0.5
    with o3d.utility.VerbosityContextManager(o3d.utility.VerbosityLevel.Debug):
        print(reference_distance)
        raw_labels = pcd.cluster_dbscan(
            eps=reference_distance * 20, min_points=300, print_progress=True
        )
        labels = np.array(raw_labels)

    highest_label = labels.max()
    print(f"point cloud has {highest_label + 1} clusters")

    # Avoid dividing by zero (or by -1) when there is at most one cluster
    label_scale = highest_label if highest_label > 0 else 1
    palette = plt.get_cmap("tab20")
    colors = palette(labels / label_scale)
    colors[labels < 0] = 0

    # Drop the alpha channel returned by the color map
    colored_pcd.colors = o3d.utility.Vector3dVector(colors[:, :3])
    o3d.visualization.draw_geometries([colored_pcd])
    return colored_pcd
def view_densities(mesh, densities):
    """
    Visualize the density values of a mesh using Open3D.

    :param mesh: The input mesh data.
    :type mesh: open3d.geometry.TriangleMesh
    :param densities: One density value per vertex of the mesh.
    :type densities: numpy.ndarray
    :return: A new mesh whose vertex colors encode the densities.
    :rtype: open3d.geometry.TriangleMesh

    The densities are rescaled to [0, 1] and mapped through the "plasma" color map.
    The colored mesh is shown in an Open3D window before being returned.

    Example usage:
    ```python
    mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(pcd, depth=10)
    density_mesh = view_densities(mesh, densities)
    ```
    """
    densities = np.asarray(densities)
    lowest = densities.min()
    density_range = densities.max() - lowest
    if density_range > 0:
        normalized = (densities - lowest) / density_range
    else:
        # All densities are equal: avoid a 0 / 0 division that would give NaN colors
        normalized = np.zeros_like(densities, dtype=float)

    # Drop the alpha channel returned by the color map
    density_colors = plt.get_cmap("plasma")(normalized)[:, :3]

    density_mesh = o3d.geometry.TriangleMesh()
    density_mesh.vertices = mesh.vertices
    density_mesh.triangles = mesh.triangles
    density_mesh.triangle_normals = mesh.triangle_normals
    density_mesh.vertex_colors = o3d.utility.Vector3dVector(density_colors)
    o3d.visualization.draw_geometries([density_mesh])
    return density_mesh
def point_cloud_pipeline(rgb_image, depth_map):
    """
    Pipeline to create a 3D mesh from an image and a depth map.

    :param str rgb_image: The input RGB image.
    :param depth_map: The input depth map.
    :return: The trimmed 3D mesh created from the input RGB image and depth map.
    :rtype: open3d.geometry.TriangleMesh

    This function first converts the input RGB image and depth map into a point cloud.
    Then, it creates a 3D mesh from the point cloud with a Poisson surface reconstruction.
    Finally, it removes the vertices with the lowest density values, displays the
    resulting mesh and returns it.

    Example usage:
    ```python
    mesh = point_cloud_pipeline("sunny_mountain.png", "outputs/depth_map.png")
    ```
    """
    pcd = pcd_from_image(rgb_image, depth_map)
    # Basic clustering: clustering(pcd)
    poisson_mesh, densities = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
        pcd=pcd, depth=10
    )
    # view_densities(poisson_mesh, densities)

    # Convert to a numpy array so that the comparison below is element-wise
    densities = np.asarray(densities)
    vertices_to_remove = densities < np.quantile(densities, 0.018)

    trimmed_mesh = o3d.geometry.TriangleMesh(poisson_mesh)
    trimmed_mesh.remove_vertices_by_mask(vertices_to_remove)
    print("Displaying reconstructed mesh ...")
    o3d.visualization.draw_geometries([trimmed_mesh])
    # Return the mesh that was trimmed and displayed, not the raw reconstruction
    return trimmed_mesh
diff --git a/environment/renderer.py b/environment/renderer.py
new file mode 100644
index 0000000..89bf1a9
--- /dev/null
+++ b/environment/renderer.py
@@ -0,0 +1,500 @@
+Generate a 2.5D view of a 2D image.
+Currently, using marigold-v1-0 (https://huggingface.co/prs-eth/marigold-v1-0)
+Source code:
+import warnings
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+import open3d
+import scipy
+from skimage.restoration.inpaint import inpaint_biharmonic
+from skybox.inpainting import inpaint_image
+from environment.depth_generation import get_depth
+from environment.mesh_pipeline import mesh_impression_pipeline
+from environment.image_segmentation import (
+ segment_anything,
+ crop_to_mask,
+ force_monotonous,
+ increasing_depth,
+from environment.depth_inpainting import inpaint_depth_controlled
def cylindrical_projection(flat_vertices, total_angle):
    """
    Wrap the vertices of a flat panorama around a vertical axis.

    Column 0 of the input drives the angle (scaled by ``total_angle`` turns),
    column 1 is kept as the height and column 2 is the distance to the axis,
    scaled by a far plane of 10.
    """
    far_plane = 10
    distance = flat_vertices[:, 2] * far_plane
    azimuth = 2 * np.pi * flat_vertices[:, 0] * total_angle
    return np.column_stack(
        (distance * np.cos(azimuth), flat_vertices[:, 1], distance * np.sin(azimuth))
    )
def spherical_projection(flat_vertices):
    """
    Map the vertices of a flat panorama onto spherical coordinates.

    Column 0 is mapped from [0, 1] to an angle theta in [0, tau], column 1 from
    [0, 1] to an angle phi in [-pi / 2, pi / 2], and column 2 is used as the radius.
    """
    flat = np.asarray(flat_vertices)
    radius = flat[:, 2]
    theta = 2 * np.pi * flat[:, 0]
    phi = -np.pi / 2 + flat[:, 1] * np.pi

    x_coords = radius * np.cos(theta)
    y_coords = radius * np.sin(theta) * np.sin(phi)
    z_coords = radius * np.sin(theta) * np.cos(phi)
    return np.column_stack((x_coords, y_coords, z_coords))
def normalize_depth(vertices):
    """
    Rescale the depth (column 2) of the vertices between 0 and 1.

    :param vertices: Array of vertices with the depth in the third column.
    :type vertices: numpy.ndarray
    :return: The same array, whose depth column was normalized in place.
    :rtype: numpy.ndarray

    When every vertex has the same depth, the depth is set to 0 instead of
    dividing by a zero range.
    """
    depth = vertices[:, 2]
    lowest = np.min(depth)
    depth_range = np.max(depth) - lowest
    if depth_range == 0:
        # Flat depth: a 0 / 0 division would fill the column with NaN
        vertices[:, 2] = 0
    else:
        vertices[:, 2] = (depth - lowest) / depth_range
    return vertices
def force_ground_closing(vertices):
    """
    Pull the lowest vertices of a 3D mesh towards the center.

    :param vertices: A numpy array representing the 3D coordinates of the vertices.
    :type vertices: numpy.ndarray
    :return: A modified copy of the vertices; the input array is left untouched.
    :rtype: numpy.ndarray

    The depth (column 2) is inverted, shifted so that its minimum is 0 and then
    multiplied by the negated attraction factor, which depends on the height
    (column 1): 0 at height 0 and close to 1 above a height of about 0.2.
    """
    closed = np.copy(vertices)
    attraction = 1 - np.exp(-vertices[:, 1] / 0.05)

    # View on the depth column of the copy: in-place operations update `closed`
    depth = closed[:, 2]
    depth[:] = 1 - depth
    depth -= np.min(depth)
    depth *= -attraction
    return closed
def remove_aberrant_triangles(mesh, limit=0.5):
    """
    Remove the triangles of a mesh whose normal has a z component below a limit.

    :param mesh: The 3D mesh to filter. Its triangle normals are read as they are.
    :type mesh: open3d.geometry.TriangleMesh
    :param limit: Minimum z component of the normal for a triangle to be kept.
    :type limit: float
    :return: A copy of the mesh without the aberrant triangles.
    :rtype: open3d.geometry.TriangleMesh
    """
    normal_z = np.asarray(mesh.triangle_normals)[:, 2]
    rejected_indices = np.nonzero(normal_z < limit)[0]

    filtered_mesh = open3d.geometry.TriangleMesh(mesh)
    filtered_mesh.remove_triangles_by_index(rejected_indices)
    return filtered_mesh
def fold_as_panorama(mesh, total_angle=0.5):
    """
    Fold a flat mesh as a panorama around the viewer.

    The depth of the vertices is normalized, then the vertices are wrapped with a
    cylindrical projection covering ``total_angle`` turns. A copy of the mesh with
    the folded vertices is returned.

    NOTE(review): the normalization works in place on the array obtained from
    ``mesh.vertices``; if that array shares memory with the mesh, the input mesh
    is modified as well - confirm.
    """
    # To force depth=0 when y=0: use force_ground_closing(np.asarray(mesh.vertices))
    flat_vertices = np.asarray(mesh.vertices)
    folded_vertices = cylindrical_projection(
        normalize_depth(flat_vertices), total_angle
    )
    folded_mesh = open3d.geometry.TriangleMesh(mesh)
    folded_mesh.vertices = open3d.utility.Vector3dVector(folded_vertices)
    return folded_mesh
def display_meshes(mesh_list):
    """
    Display several meshes together with Open3D.

    The textures of every mesh are cleared first (the meshes of the list are
    modified), then all meshes are drawn in a single window.
    """
    for untextured in mesh_list:
        untextured.textures = []
    open3d.visualization.draw_geometries(mesh_list)
def save_mesh(mesh, filename, view=False):
    """
    Save the mesh to a file, optionally previewing it first.

    :param mesh: The input mesh data.
    :type mesh: open3d.geometry.TriangleMesh
    :param filename: The path to the file where the mesh will be saved.
    :type filename: str
    :param view: A boolean flag indicating whether to visualize the mesh before saving it.
    :type view: bool

    The preview is drawn from an untextured copy, so the mesh written to the file
    keeps its textures whether or not it was previewed.

    Example usage:
    ```python
    save_mesh(mesh, "outputs/3D view.obj", view=True)
    ```
    """
    if view:
        # Textures are disabled for the visualization only: work on a copy so that
        # the saved mesh is not stripped of its textures.
        preview = open3d.geometry.TriangleMesh(mesh)
        preview.textures = []
        open3d.visualization.draw_geometries([preview], mesh_show_wireframe=True)
    open3d.io.write_triangle_mesh(filename, mesh)
def moving_average(data, window_size=5):
    """
    Smooth 1D data with a centered moving average.

    The output has the same length as the input; values outside the input are
    treated as zeros, so the averages near both ends are pulled towards zero.
    """
    kernel = np.ones(window_size)
    kernel /= window_size
    return np.convolve(data, kernel, mode="same")
def horizon_height(depth_map):
    """
    Get the row of the horizon using the depth gradient only, not precise.

    :param depth_map: 2D depth map with the rows along the first axis.
    :type depth_map: numpy.ndarray
    :return: Index of the row where the smoothed vertical depth gradient is the largest.
    """
    y_depth_grad = np.gradient(depth_map, axis=0)
    reduced_grad = np.mean(y_depth_grad, axis=1)
    # The smoothing window is a fiftieth of the height; keep at least one sample
    # because an empty window makes np.convolve fail on maps under 50 rows.
    window_size = max(1, depth_map.shape[0] // 50)
    smoothed_grad = moving_average(reduced_grad, window_size)
    # The horizon is estimated as the row with the strongest gradient
    return np.argmax(smoothed_grad)
def plot_horizon_computation(depth_map):
    """
    Plot the signals used to locate the horizon line of a depth map.

    :param depth_map: 2D depth map with the rows along the first axis.
    :type depth_map: numpy.ndarray

    The left panel shows the depth map with its average depth per row. The right
    panel shows the raw and smoothed vertical depth gradients and the detected
    horizon. The call blocks until the plot window is closed, then returns.
    """
    mean_depth = np.mean(depth_map, axis=1)
    y_depth_grad = np.gradient(depth_map, axis=0)
    reduced_grad = np.mean(y_depth_grad, axis=1)
    # Keep at least one sample in the window: np.convolve rejects an empty kernel
    window_size = max(1, depth_map.shape[0] // 50)
    smoothed_grad = moving_average(reduced_grad, window_size)
    horizon = horizon_height(depth_map)
    y_indices = np.arange(depth_map.shape[0])

    _, axes = plt.subplots(1, 2, sharey=True)
    axes[0].imshow(depth_map)
    axes[0].plot(mean_depth * depth_map.shape[1], y_indices, label="Average depth")
    axes[1].plot(reduced_grad, y_indices, label="Vertical depth gradient")
    axes[1].plot(smoothed_grad, y_indices, label="Smoothed depth gradient")
    axes[1].plot(
        (np.min(reduced_grad), np.max(reduced_grad)),
        (horizon, horizon),
        label="Detected horizon",
    )
    # Gradient smoothed a second time, with the default window
    axes[1].plot(
        moving_average(smoothed_grad),
        y_indices,
    )
    # Vertical line marking a null gradient
    axes[1].plot((0, 0), (0, depth_map.shape[0]))
    plt.grid()
    # Align the gradient panel vertically with the image panel
    axes[1].set_position(
        [
            axes[1].get_position().x0,
            axes[0].get_position().y0,
            axes[1].get_position().width,
            axes[0].get_position().height,
        ]
    )
    plt.legend()
    plt.show()
    # The interpreter is deliberately not terminated here, so callers can carry on
    # once the window is closed.
def generate_mesh(texture, depth_map, resolution=256):
    """
    Generate a panoramic 3D mesh from a texture and a depth map.

    :param texture: The mesh texture, forwarded unchanged to
        ``mesh_impression_pipeline`` as its ``texture_image`` argument.
    :param depth_map: The input depth map.
    :type depth_map: numpy.ndarray
    :param int resolution: Vertices per side in the generated mesh.
    :return: A 3D mesh created from the input texture and depth map.
    :rtype: open3d.geometry.TriangleMesh

    Steps:

    1. Build a flat mesh from the depth map with ``mesh_impression_pipeline``.
    2. Remove the triangles whose normal has a z component below 0.1.
    3. Fold the remaining mesh as a panorama covering one full turn.

    Example usage:
    ```python
    mesh = generate_mesh(texture, depth_map, 256)
    ```
    """
    # Point cloud alternative: environment.point_cloud_pipeline(input_image, depth_map)
    flat_mesh = mesh_impression_pipeline(depth_map, resolution, texture)
    cleaned_mesh = remove_aberrant_triangles(flat_mesh, 0.1)
    # To compare both meshes: display_meshes([flat_mesh, folded_mesh])
    folded_mesh = fold_as_panorama(cleaned_mesh, 1)
    return folded_mesh
def mesh_panorama_from_files(
    input_image, output_mesh, depth_image=None, resolution=256, show_horizon_plot=False
):
    """
    Generate a 3D mesh from an image file and, optionally, a depth map file.

    :param str input_image: The input RGB image.
    :param output_mesh: The path to the file where the mesh will be saved,
        or None to skip saving.
    :type output_mesh: str or None
    :param depth_image: The input depth map. If not provided, it will be generated.
    :type depth_image: str or None
    :param int resolution: Vertices per side in the generated mesh.
    :param bool show_horizon_plot: Diagnostic only. When True, the horizon detection
        plot is displayed before the mesh is built; this blocks until the plot
        window is closed.
    :return: A 3D mesh created from the input RGB image and depth map.
    :rtype: open3d.geometry.TriangleMesh

    Example usage:
    ```python
    mesh_panorama_from_files("sunny_mountain.png", "outputs/3D view.obj", "outputs/depth_map.png")
    ```
    """
    if depth_image is None:
        depth_map = get_depth(Image.open(input_image))
    else:
        # Image in I;16 format with depth on 16 bits.
        depth_map = np.asarray(open3d.io.read_image(depth_image)) / (2**16 - 1)

    # The horizon plot is a debugging aid: it must not run (and block the mesh
    # generation) unless explicitly requested.
    if show_horizon_plot:
        plot_horizon_computation(depth_map)

    main_texture = open3d.io.read_image(input_image)
    new_mesh = generate_mesh(main_texture, depth_map, resolution)
    if output_mesh is not None:
        save_mesh(new_mesh, output_mesh)
        print(f"Mesh saved as '{output_mesh}'.")
    return new_mesh
def segment_stuff(image_path, depth_path=None):
    """
    Segment an image with the help of its depth map.

    :param str image_path: The path to the input image.
    :param str depth_path: The path to the input depth map. If not provided, it will be generated.
    :return: The result of ``segment_anything``; the script section of this module
        unpacks it as ``(skybox, ground, objects)``.

    The depth map in use is also written to "depth.npy" in the working directory.

    Example usage:
    ```python
    skybox, ground, objects = segment_stuff("../sunny_mountain.png", "sunny_depth_map.png")
    ```
    """
    source_image = Image.open(image_path)
    if depth_path is not None:
        # Image in I;16 format with depth on 16 bits.
        depth_map = np.asarray(open3d.io.read_image(depth_path)) / (2**16 - 1)
    else:
        depth_map = get_depth(source_image)
    np.save("depth.npy", depth_map)
    return segment_anything(source_image, depth_map)
def mask_image(image, mask):
    """
    Keep the pixels of an image selected by a mask and blank the others.

    :param numpy.ndarray image: Image indexed by (row, column, ...).
    :param numpy.ndarray mask: Mask whose non-zero entries select the pixels to keep.
    :return: An array shaped like the image, zero outside the mask.
    :rtype: numpy.ndarray
    """
    kept = np.zeros_like(image)
    selected = np.nonzero(mask)
    rows, cols = selected[0], selected[1]
    kept[rows, cols] = image[rows, cols]
    return kept
def filling_strategy(image_np, large_mask):
    """
    Create a very large image made of 3 x 3 mirrored views of the cropped input.

    The central tile is the cropped image itself; tiles in the first and last rows
    are flipped vertically and tiles in the first and last columns are flipped
    horizontally, so that neighbouring tiles mirror each other. The result is
    displayed, but the function is unfinished and always raises.

    :raises NotImplementedError: Always, after displaying the tiled image.
    """
    cropped = crop_to_mask(image_np, large_mask)
    tile_height, tile_width, channels = (
        cropped.shape[0],
        cropped.shape[1],
        cropped.shape[2],
    )
    # Map in big size
    large_skybox = np.empty((tile_height * 3, tile_width * 3, channels))
    for row in range(3):
        for col in range(3):
            tile = cropped
            if row % 2 == 0:
                # Outer rows are mirrored vertically
                tile = tile[::-1]
            if col % 2 == 0:
                # Outer columns are mirrored horizontally
                tile = tile[:, ::-1]
            large_skybox[
                row * tile_height : (row + 1) * tile_height,
                col * tile_width : (col + 1) * tile_width,
            ] = tile
    Image.fromarray(large_skybox.astype(np.uint8)).show()
    # Fill holes
    # For testing: return large_skybox
    raise NotImplementedError("This function is not finished.")
def enlarge_mask(initial_mask, iterations=20):
    """
    Grow a mask by repeating a binary dilation.

    :param initial_mask: The mask to enlarge.
    :param int iterations: Number of successive dilations to apply.
    :return: The dilated mask, or the input itself when no dilation is applied.
    """
    dilate = scipy.ndimage.binary_dilation
    grown_mask = initial_mask
    for _step in range(iterations):
        grown_mask = dilate(grown_mask)
    return grown_mask
def inpaint_skybox(image_np, skybox):
    """
    Apply inpainting to complete the skybox.

    Everything outside the skybox mask (enlarged by dilation) is first filled
    with a biharmonic inpainting, which is displayed, then repainted by the
    ``inpaint_image`` model with the prompt "the sky".

    :param numpy.ndarray image_np: The initial image as an array.
    :param numpy.ndarray skybox: Mask of the pixels belonging to the skybox.
    :return: The first image returned by ``inpaint_image``.
    """
    hole_mask = enlarge_mask(np.logical_not(skybox))

    # Rough first fill, rescaled by 255 before the conversion to 8 bits
    rough_fill = inpaint_biharmonic(image_np, hole_mask, channel_axis=-1) * 255
    inpainted_skybox = Image.fromarray(rough_fill.astype(np.uint8))
    inpainted_skybox.show()

    candidates = inpaint_image(
        "the sky", inpainted_skybox, Image.fromarray(hole_mask), num_inference_steps=50
    )
    return candidates[0]
def inpaint_ground(image, image_np, depth_map, ground, filling_mask):
    """
    Apply inpainting to complete the ground.

    The ground segment is stretched horizontally to pre-fill the holes, the real
    ground pixels are pasted back on top, then the result is inpainted twice:
    once with ``inpaint_image`` (displayed only) and once with the depth
    controlled pipeline, whose first image is returned.

    :param PIL.Image.Image image: The initial image, used for its dimensions.
    :param numpy.ndarray image_np: The initial image as an array.
    :param numpy.ndarray depth_map: The depth map.
    :param numpy.ndarray ground: Mask of the pixels belonging to the ground.
    :param PIL.Image.Image filling_mask: Mask of the area to inpaint.
    :return: The first image returned by ``inpaint_depth_controlled``.
    """
    width, height = image.width, image.height
    ground_segment = mask_image(image_np, ground)
    segment_image = Image.fromarray(ground_segment)

    # Stretch the segment five times in width and keep the central fifth
    stretched_ground = segment_image.resize((width * 5, height)).crop(
        (width * 2, 0, width * 3, height)
    )

    # Stretched background first, then the real ground pixels on top of it
    inpainted_ground = Image.fromarray(ground_segment)
    inpainted_ground.paste(stretched_ground)
    inpainted_ground.paste(segment_image, mask=Image.fromarray(ground))

    print("Completing ground, inpainting")
    complete_ground = inpaint_image("the ground", inpainted_ground, filling_mask)[0]
    complete_ground.show()

    print("Completing ground with controlnet")
    ground_depth = 1 - increasing_depth(force_monotonous(depth_map))
    controlled_candidates = inpaint_depth_controlled(
        inpainted_ground,
        filling_mask,
        Image.fromarray(ground_depth),
        "the ground",
    )
    return controlled_candidates[0]
def complete_segments(image, depth_map, skybox, ground, objects):
    """
    Process the image parts.

    - Sky: Reshape image size, inpaint holes.
    - Ground: Inpaint holes, reshape rectangle.
    - Objects: Store depth, normal map (?).

    :param PIL.Image.Image image: The initial image
    :param numpy.ndarray depth_map: The depth map
    :param numpy.ndarray skybox: Mask of the pixels belonging to the skybox
    :param numpy.ndarray ground: Mask of the pixels belonging to the ground
    :param numpy.ndarray objects: Object label of each pixel; labels from 0 to the
        maximum are processed (presumably negative means "no object" - confirm
        against the segmentation code)
    :return: The completed skybox, the completed ground and, for each object,
        its cropped image and cropped depth.
    :rtype: tuple[PIL.Image.Image, PIL.Image.Image, list[tuple[numpy.ndarray, numpy.ndarray]]]
    """
    image_np = np.asarray(image)

    # Complete the skybox
    complete_skybox = inpaint_skybox(image_np, skybox)
    complete_skybox.show()

    # Complete the terrain.
    # np.logical_and only takes two inputs (a third positional argument is its
    # output buffer), so the three conditions are combined with a reduction.
    filling_mask = np.logical_and.reduce(
        (np.logical_not(ground), np.logical_not(skybox), objects >= 0)
    )
    filling_mask = enlarge_mask(filling_mask)
    complete_ground = inpaint_ground(
        image, image_np, depth_map, ground, Image.fromarray(filling_mask)
    )
    complete_ground.show()

    # Save the objects
    objects_data = []
    for i in range(int(np.max(objects)) + 1):
        mask = objects == i
        cropping = crop_to_mask(image_np, mask)
        depth = crop_to_mask(depth_map, mask)
        # Handle occlusions
        objects_data.append((cropping, depth))
    warnings.warn("Objects occlusions cannot be handled yet.")
    return complete_skybox, complete_ground, objects_data
def save_as_scene(skybox, terrain, depth_map, _objects):
    """
    Save all the elements as objects in a scene.

    The skybox and the terrain texture are written as PNG files and the terrain
    mesh as an OBJ file, all under the "outputs" directory. Objects are not
    saved yet; a warning is emitted instead.

    :param PIL.Image.Image skybox: Skybox to save.
    :param PIL.Image.Image terrain: Terrain texture.
    :param numpy.ndarray depth_map: Terrain depth map.
    :param _objects: Objects to save (currently unused).
    :type _objects: list[tuple[PIL.Image.Image, numpy.ndarray]]
    """
    skybox_path = "outputs/complete_skybox.png"
    terrain_texture_path = "outputs/terrain_texture.png"
    terrain_mesh_path = "outputs/terrain_mesh.obj"

    skybox.save(skybox_path)
    print("Saved the skybox under " + skybox_path)

    # The texture is written to disk first, then read back as an Open3D image
    terrain.save(terrain_texture_path)
    terrain_texture = open3d.io.read_image(terrain_texture_path)
    terrain_mesh = generate_mesh(terrain_texture, depth_map)
    save_mesh(terrain_mesh, terrain_mesh_path)
    print("Saved the mesh under " + terrain_mesh_path)

    warnings.warn("Objects are not handled yet.")
if __name__ == "__main__":
    # Manual entry point: completes the segments of the sample image from
    # segmentation data previously saved under "outputs/".
    #
    # To generate a new mesh
    # mesh_panorama_from_files("../sunny_mountain.png", "outputs/sunny 3D.obj", "outputs/sunny_depth_map.png")
    # To regenerate data
    # skybox, ground, objects = segment_stuff(
    #     "../sunny_mountain.png", "sunny_depth_map.png"
    # )
    # Skybox mask only
    # skybox_mask = mask_skybox("../forest.png", "outputs/depth_map.png")
    # Save the data to files
    # np.save("outputs/skybox.npy", skybox)
    # np.save("outputs/ground.npy", ground)
    # np.save("outputs/objects.npy", objects)
    SKYBOX = np.load("outputs/skybox.npy")
    GROUND = np.load("outputs/ground.npy")
    OBJECTS = np.load("outputs/objects.npy")
    # complete_segments requires the three segmentation arrays as well
    complete_segments(
        Image.open("../sunny_mountain.png"),
        np.asarray(open3d.io.read_image("sunny_depth_map.png")) / (2**16 - 1),
        SKYBOX,
        GROUND,
        OBJECTS,
    )
    # np.save("mask.npy", skybox_mask)
diff --git a/environment/sunny_depth_map.png b/environment/sunny_depth_map.png
new file mode 100644
index 0000000..7936ced
Binary files /dev/null and b/environment/sunny_depth_map.png differ
diff --git a/requirements-optional.txt b/requirements-optional.txt
new file mode 100644
index 0000000..de4cf03
--- /dev/null
+++ b/requirements-optional.txt
@@ -0,0 +1,3 @@
+PyAudio~=0.2.14 # For speech-to-text only
+datasets~=2.18.0 # For speech-to-text only
+# audiocraft~=1.2.0 # for text-to-sound, won't work, see https://github.com/facebookresearch/audiocraft?tab=readme-ov-file#installation
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7a1c942
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+torch~=2.3.0 # see https://pytorch.org/get-started/locally/ when it gets incompatible with the CUDA version
diff --git a/server/run.py b/server/run.py
new file mode 100644
index 0000000..7235bfc
--- /dev/null
+++ b/server/run.py
@@ -0,0 +1,533 @@
+Starts a Python server using sockets, able to pass data to various AI functions.
+import os
+import time
+import json
+import socket
+import socketserver
+import threading
+import warnings
+import torch.cuda
+from PIL import Image
+from asr.speech_to_text import do_audio_transcription
+from server.utils import (
+ hex_to_pillow,
+ get_server_address,
+ hex_to_bytes,
+ image_response,
+ get_configuration_data,
+from server.task_tracker import TaskTracker
+from skybox.diffusion import generate_images, refine_images
+from skybox.inpainting import make_transparent_black, inpaint_panorama_pipeline
+from skybox import panorama_creator
+# Max chunk size for input data
+CHUNK_SIZE = 4096
+def init_server(server_ip, server_port, backlog=5):
+    """
+    Create a TCP socket bound to the given address and start listening on it.
+
+    :param str server_ip: IP address (or host name) to bind to.
+    :param int server_port: Port to bind to.
+    :param int backlog: Number of pending connections passed to ``listen``.
+    :return socket.socket: The listening socket; the caller is responsible for closing it.
+    :raises OSError: If binding or listening fails (the socket is closed before re-raising).
+    """
+    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    try:
+        server_socket.bind((server_ip, server_port))
+        server_socket.listen(backlog)
+    except OSError:
+        # Do not leak the file descriptor when the address is unavailable.
+        server_socket.close()
+        raise
+    print(f"Server is listening on {server_ip}:{server_port}")
+    return server_socket
+def server_data():
+    """
+    Build the public description of this server.
+
+    :return dict: The "name", "description" and "version" entries of the configuration.
+    """
+    configuration = get_configuration_data()
+    return {key: configuration[key] for key in ("name", "description", "version")}
+def completion_report(completion, client_socket, task_id):
+    """
+    Send a progress report for a task through the TCP connection.
+
+    :param int completion: Task completion from 0 to 100
+    :param socket.socket client_socket: TCP client socket
+    :param int task_id: Identifier of the task being reported.
+    :return dict: The progress payload that was sent (before JSON encoding).
+    """
+    payload = {"completion": "progress", "taskCompletion": completion, "taskId": task_id}
+    send_response(
+        {"status": 200, "data": json.dumps(payload), "type": "completion"},
+        client_socket,
+    )
+    return payload
+def new_skybox_handler(prompt, advanced, progress_tracker=None):
+    """
+    Generate a new skybox and wrap it in an image response.
+
+    :param string prompt: The prompt for the skybox generation.
+    :param bool advanced: If true, only generate the base strip and paste it on a black canvas;
+        otherwise run the whole panorama pipeline.
+    :param TaskTracker | None progress_tracker: TaskTracker object to report each step
+    :return dict: Image response built by ``image_response``.
+    """
+    if not advanced:
+        panorama = panorama_creator.generate_panorama(prompt, progress_tracker=progress_tracker)
+        return image_response(panorama)
+    strip_height = 416
+    # The canvas is 2.5 times taller than the strip, which is pasted one strip-height down.
+    canvas = Image.new("RGB", (2504, strip_height * 5 // 2), "black")
+    step_end = progress_tracker.callback if progress_tracker else None
+    strip = generate_images(
+        prompt,
+        num_inference_steps=50,
+        width=2504,
+        height=strip_height,
+        callback_on_step_end=step_end,
+    )[0]
+    canvas.paste(strip, (0, strip_height))
+    return image_response(canvas)
+def new_skybox_local_handler(prompt, destination_path, step_callback=None):
+    """
+    Generate a 2048x1024 skybox and save it to disk.
+
+    :param string prompt: The prompt for the skybox generation.
+    :param string destination_path: Where to save the generated image.
+    :param Callable | None step_callback: Callback function to report each step.
+    :return dict: A dictionary with "skyboxFilePath" key.
+    """
+    skybox = generate_images(
+        prompt, callback_on_step_end=step_callback, height=1024, width=2048
+    )[0]
+    skybox.save(destination_path)
+    return {"skyboxFilePath": destination_path}
+def panorama_handler(prompt, step_callback=None):
+    """
+    Generate a new panorama skybox (no seam line) and wrap it in an image response.
+
+    Deprecated: since pipeline v0.4, use new_skybox_handler instead.
+
+    :param string prompt: The prompt for the skybox generation.
+    :param Callable | None step_callback: Callback function to report each step.
+    :return dict: A dictionary containing the image bytes in hexadecimal string.
+    """
+    generated = generate_images(
+        prompt, callback_on_step_end=step_callback, height=1024, width=2048
+    )
+    # Borders are rewritten first, then the image is projected and its borders blended.
+    seamless = panorama_creator.rewrite_image_borders(generated[0])
+    projected = panorama_creator.cylindrical_projection(seamless)
+    return image_response(panorama_creator.blend_borders(projected, 10))
+def refine_skybox_handler(image_hex, prompt, step_callback=None):
+    """
+    Refine the middle band of an image with the SDXL refiner.
+
+    :param str image_hex: Base image hexadecimal string, PNG format
+    :param str prompt: Prompt to guide the refining process.
+    :param Callable | None step_callback: Function called at the end of each step.
+    :return dict: Image response holding the whole frame with the refined band pasted back.
+    """
+    frame = hex_to_pillow(image_hex).convert("RGB")
+    # Only the band between 2/5 and 4/5 of the height is refined.
+    band_top = frame.height * 2 // 5
+    band_bottom = frame.height * 4 // 5
+    band = frame.crop((0, band_top, frame.width, band_bottom))
+    refined_band = refine_images(
+        prompt, band, num_inference_steps=50, callback_on_step_end=step_callback
+    )[0]
+    frame.paste(refined_band, (0, band_top))
+    return image_response(frame)
+def remove_seam_handler(image_hex, _step_callback=None):
+    """
+    Rewrite the borders of the middle band of an image so that it tiles without a seam.
+
+    :param str image_hex: Base image hexadecimal string, PNG format
+    :param Callable | None _step_callback: Unused; accepted for signature parity with other handlers.
+    :return dict: A dictionary with the PNG image encoded.
+    """
+    frame = hex_to_pillow(image_hex).convert("RGB")
+    band_top = frame.height * 2 // 5
+    band_bottom = frame.height * 4 // 5
+    band = frame.crop((0, band_top, frame.width, band_bottom))
+    frame.paste(panorama_creator.rewrite_image_borders(band), (0, band_top))
+    return image_response(frame)
+def extend_skybox_handler(image_hex, step_callback=None):
+    """
+    Extend the middle band of the given image to create a larger skybox.
+
+    :param str image_hex: Base image hexadecimal string, PNG format
+    :param Callable | None step_callback: Forwarded as-is to ``panorama_creator.extend_image``.
+    :return dict: A dictionary with the PNG image encoded.
+    """
+    frame = hex_to_pillow(image_hex).convert("RGB")
+    band_box = (0, frame.height * 2 // 5, frame.width, frame.height * 4 // 5)
+    larger = panorama_creator.extend_image(
+        frame.crop(band_box), 50, step_callback=step_callback
+    )
+    return image_response(larger)
+def asr_local_handler(audio_file_path):
+    """
+    Transcribe an audio file stored on the server machine.
+
+    :param str audio_file_path: Audio file path
+    :return dict: Text enclosed in "transcription" key; when the file is missing, both
+        "transcription" and "message" hold the error text.
+    """
+    if not os.path.exists(audio_file_path):
+        print("File does not exist")
+        return {
+            "transcription": f"Error: input file {audio_file_path} does not exist!",
+            "message": f"Error: input file {audio_file_path} does not exist!",
+        }
+    transcription = do_audio_transcription(audio_file_path)
+    print(transcription)
+    return {"transcription": transcription["text"]}
+def asr_handler(audio_bytes):
+    """
+    Transcribe audio received as a hexadecimal string.
+
+    :param str audio_bytes: The audio as byte string, hexadecimal encoded
+    :return dict: Text enclosed in "transcription" key
+    """
+    transcription = do_audio_transcription(hex_to_bytes(audio_bytes))
+    print(transcription)
+    return {"transcription": transcription["text"]}
+def inpaint_handler(image_hex, mask_image_hex, prompt, step_callback=None):
+    """
+    Inpaint (draw on) an image using a prompt and wrap the result in an image response.
+
+    :param str image_hex: Hexadecimal string encoding of the image in PNG format.
+    :param str mask_image_hex: Hexadecimal string encoding of the mask image, PNG format
+    :param str prompt: Prompt for inpainting
+    :param step_callback: Function to run at the end of each step f : step_number -> Any
+    :type step_callback: Callable | None
+    :return dict: The new inpainted image, in standard image response format.
+    """
+    base_image = hex_to_pillow(image_hex).convert("RGB")
+    raw_mask = hex_to_pillow(mask_image_hex)
+    # The mask is resized to the base image so both cover the same pixels.
+    mask = make_transparent_black(raw_mask).resize(base_image.size)
+    inpainted = inpaint_panorama_pipeline(base_image, mask, prompt, step_callback)
+    return image_response(inpainted)
+def inpaint_local_handler(
+    init_image_path, mask_image_path, prompt, destination_path, step_callback=None
+):
+    """
+    Inpaint (draw on) an image stored on disk using a prompt, and save the result.
+
+    :param str init_image_path: Base image path
+    :param str mask_image_path: Mask image path
+    :param str prompt: Prompt for inpainting
+    :param str destination_path: Destination path for the inpainted image
+    :param step_callback: Function to run at the end of each step f : step_number -> Any
+    :type step_callback: Callable | None
+    :return dict: Path to the new image, enclosed in "inpaintedFilePath" key
+    """
+    # Open the files in context managers so their handles are released right away;
+    # convert/resize return independent in-memory copies.
+    with Image.open(init_image_path) as init_file:
+        init_image = init_file.convert("RGB")
+    with Image.open(mask_image_path) as mask_file:
+        mask_image = make_transparent_black(mask_file).resize(init_image.size)
+    new_image = inpaint_panorama_pipeline(init_image, mask_image, prompt, step_callback)
+    new_image.save(destination_path)
+    return {"inpaintedFilePath": destination_path}
+def send_response(response, client_socket):
+    """
+    Serialize a response to JSON and send it through the client socket.
+
+    :param dict response: Response to send, a flat (not nested) dictionary.
+    :param socket.socket client_socket: Socket to send the response
+    """
+    payload = json.dumps(response).encode()
+    client_socket.sendall(payload)
+def start_task(task_dict, tracker):
+    """
+    Run the handler matching ``task_dict["type"]`` and return its result.
+
+    :param dict task_dict: Query data; holds "type" plus the fields read by its handler.
+    :param TaskTracker | None tracker: Progress tracker, or None when no report is wanted.
+    :return dict: Dictionary containing the response to this task.
+    :raises NotImplementedError: If the task type is not recognized.
+    """
+    task_type = task_dict["type"]
+    # The tracker is optional: without one, handlers simply get no step callback.
+    step_callback = tracker.callback if tracker is not None else None
+    result = None
+    print(f"Starting task: {task_type}")
+    if task_type == "new-skybox-local":
+        result = new_skybox_local_handler(
+            task_dict["prompt"],
+            task_dict["outputFilePath"],
+            step_callback=step_callback,
+        )
+    elif task_type == "new-skybox":
+        result = new_skybox_handler(
+            task_dict["prompt"], bool(task_dict["quick"]), progress_tracker=tracker
+        )
+    elif task_type == "panorama":
+        result = panorama_handler(task_dict["prompt"], step_callback=step_callback)
+    elif task_type == "inpainting-local":
+        result = inpaint_local_handler(
+            task_dict["imagePath"],
+            task_dict["maskPath"],
+            task_dict["prompt"],
+            task_dict["outputFilePath"],
+            step_callback=step_callback,
+        )
+    elif task_type == "inpainting":
+        result = inpaint_handler(
+            task_dict["imageBytes"],
+            task_dict["maskBytes"],
+            task_dict["prompt"],
+            step_callback=step_callback,
+        )
+    elif task_type == "refine-skybox":
+        result = refine_skybox_handler(
+            task_dict["imageBytes"],
+            task_dict["prompt"],
+            step_callback=step_callback,
+        )
+    elif task_type == "remove-seam":
+        result = remove_seam_handler(
+            task_dict["imageBytes"],
+            _step_callback=step_callback,
+        )
+    elif task_type == "extend-skybox":
+        # NOTE(review): this handler receives the tracker object itself, not its callback.
+        result = extend_skybox_handler(
+            task_dict["imageBytes"],
+            step_callback=tracker,
+        )
+    elif task_type == "asr-local":
+        result = asr_local_handler(task_dict["audioPath"])
+    elif task_type == "asr":
+        result = asr_handler(task_dict["audioBytes"])
+    elif task_type == "ping":
+        # Read the clock once so both timestamps are consistent with each other.
+        now_ms = int(time.time() * 1000)
+        result = {
+            "queryTimestamp": task_dict["queryTimestamp"],
+            "responseTimestamp": now_ms,
+            "responseMilliseconds": now_ms - task_dict["queryTimestamp"],
+        }
+    if result is None:
+        raise NotImplementedError(
+            f"The task '{task_type}' is not recognized as a valid task type."
+        )
+    return result
+def prepare_response(json_data, tracker):
+    """
+    Build the response to a decoded query, running the requested task if there is one.
+
+    :param dict json_data: Decoded query dictionary; must hold a "type" entry.
+    :param TaskTracker | None tracker: TaskTracker to report completion.
+    :return dict response: Response dictionary
+    """
+    # Progress reports are opt-in: drop the tracker unless the query asks for them.
+    wants_reports = "reportCompletion" in json_data and json_data["reportCompletion"] != 0
+    if not wants_reports:
+        tracker = None
+    query_type = json_data["type"]
+    if query_type == "info":
+        return {
+            "status": 200,
+            "data": json.dumps(server_data()),
+            "message": "Info",
+            "type": "info",
+        }
+    if query_type == "completion":
+        return {
+            "status": 200,
+            "data": json.dumps({"completion": 0}),
+            "type": "completion",
+        }
+    task_types = (
+        "ping",
+        "new-skybox-local",
+        "new-skybox",
+        "panorama",
+        "refine-skybox",
+        "inpainting-local",
+        "inpainting",
+        "remove-seam",
+        "extend-skybox",
+        "asr-local",
+        "asr",
+    )
+    if query_type not in task_types:
+        return {
+            "status": 404,
+            "data": json.dumps({}),
+            "message": f"Unknown type: {query_type}",
+            "type": "error",
+        }
+    # The task id is read before the task starts, so a query without one fails early.
+    success = {"status": 200, "taskId": json_data["taskId"], "type": query_type}
+    try:
+        answer_data = start_task(json_data, tracker)
+    except torch.cuda.OutOfMemoryError as err:
+        return {
+            "status": 500,
+            "data": json.dumps({}),
+            "message": f"Out of memory: {err}",
+            "type": "error",
+        }
+    success["data"] = json.dumps(answer_data)
+    return success
+def safe_send(response, client_socket):
+    """
+    Send a response to the client, splitting large "data" payloads into fragments.
+
+    Payloads longer than the size limit are sent as several responses with status 206,
+    each carrying one slice of "data" plus "index" and "total_fragments".
+
+    :param dict response: Response to send; its "data" entry is the payload to split.
+    :param socket.socket client_socket: Socket to send the response
+    """
+    size_limit = 8196 * 16
+    payload = response["data"]
+    if len(payload) <= size_limit:
+        send_response(response, client_socket)
+        return
+    # Ceiling division: the last, possibly shorter, fragment must be sent too.
+    total_fragments = -(-len(payload) // size_limit)
+    capsule = dict(response)
+    capsule["status"] = 206
+    capsule["total_fragments"] = total_fragments
+    print(f"Fragmenting response in {total_fragments} fragments.")
+    for index in range(total_fragments):
+        start = index * size_limit
+        capsule["data"] = payload[start:start + size_limit]
+        capsule["index"] = index
+        send_response(capsule, client_socket)
+def handle_query(data, client_socket):
+    """
+    Decode a query from a client, process it and send the response back.
+
+    :param str data: Data received from client
+    :param socket.socket client_socket: The client socket
+    """
+    if not data or data.isspace():
+        print("Empty data, aborting")
+        response = {"status": 400, "data": json.dumps({}), "message": "Empty data"}
+    else:
+        try:
+            json_data = json.loads(data)
+        except ValueError as error:
+            print("Data is not json")
+            # NOTE(review): 304 is kept for client compatibility; 400 would be the usual code.
+            response = {
+                "status": 304,
+                "data": json.dumps({}),
+                "message": f"Wrong JSON: {error}",
+            }
+        else:
+            if isinstance(json_data, dict):
+                # Not every query carries a task id (e.g. "info"), so do not require it here.
+                tracker = TaskTracker(
+                    client_socket, json_data.get("taskId"), completion_report
+                )
+                response = prepare_response(json_data, tracker)
+            else:
+                response = {
+                    "status": 400,
+                    "data": json.dumps({}),
+                    "message": "Query must be a JSON object",
+                }
+    send_response(response, client_socket)
+def _receive_query(client_socket):
+    """
+    Read chunks from the socket until one ends with an unescaped closing brace.
+
+    :param socket.socket client_socket: Connected client socket
+    :return str: Everything received, decoded
+    """
+    chunks = []
+    while True:
+        chunk = client_socket.recv(CHUNK_SIZE)
+        if not chunk:
+            # recv returns an empty bytes object once the peer has closed: stop waiting.
+            warnings.warn("Received 0 bytes from client")
+            break
+        chunks.append(chunk)
+        if len(chunk) == 1:
+            print("Received short data " + chunk.decode())
+        # Slice instead of indexing: indexing bytes yields an int, never equal to b"\\".
+        elif chunk.endswith(b"}") and chunk[-2:-1] != b"\\":
+            break
+    return b"".join(chunks).decode()
+
+
+def wait_for_connection(server_socket):
+    """
+    Wait for a connection from a client, answer its query, then close the connection.
+
+    :param socket.socket server_socket: Server socket
+    """
+    # Accept incoming connection
+    client_socket, client_address = server_socket.accept()
+    print(f"Client {client_address} connected.")
+    try:
+        handle_query(_receive_query(client_socket), client_socket)
+    except (ConnectionResetError, BrokenPipeError):
+        print("Connection reset during transmission.")
+    finally:
+        # Close the connection, whatever happened while handling the query
+        client_socket.close()
+
+
+def handle(client_socket, client_address):
+    """
+    Read the data until termination and take action.
+
+    :param socket.socket client_socket: TCP socket connected to the client
+    :param tuple client_address: (host, port) of the client
+    """
+    query_string = _receive_query(client_socket)
+    print(f"Request from {client_address[0]}:{client_address[1]}")
+    handle_query(query_string, client_socket)
+class TCPHandler(socketserver.StreamRequestHandler):
+ """Request handler used by the TCP server; delegates to the module-level ``handle``."""
+ def handle(self):
+ """Forward the client socket and client address to the module-level ``handle`` function."""
+ handle(self.request, self.client_address)
+def run_server(forked_server=True):
+    """
+    Start the server on the configured address.
+
+    :param bool forked_server: If true, serve from a background thread and return at once;
+        otherwise block until the server is shut down.
+    :return socketserver.TCPServer: The server, so that callers can ``shutdown()`` it.
+    """
+    server_ip, server_port = get_server_address()
+    # Create the server
+    server = socketserver.TCPServer((server_ip, server_port), TCPHandler)
+
+    def serve_until_shutdown():
+        """Serve requests and release the listening socket once serving stops."""
+        # The context manager lives in the serving thread: leaving it in the caller
+        # would close the socket as soon as the thread is started.
+        with server:
+            server.serve_forever()
+
+    print(f"Starting server on {server_ip}:{server_port}")
+    if forked_server:
+        server_thread = threading.Thread(target=serve_until_shutdown)
+        server_thread.start()
+    else:
+        serve_until_shutdown()
+    return server
+
+
+if __name__ == "__main__":
+    run_server(False)
diff --git a/server/task_tracker.py b/server/task_tracker.py
new file mode 100644
index 0000000..6562df7
--- /dev/null
+++ b/server/task_tracker.py
@@ -0,0 +1,54 @@
+A task tracker object keeps track of the completion of a task and sends reports at regular intervals.
+import time
+class TaskTracker:
+    """Follows the progress of a long task and reports it, at most once every two seconds."""
+
+    def __init__(self, socket, task_id, reporter):
+        """
+        Start a task tracker.
+
+        :param socket: Client socket to send reports to.
+        :param int task_id: ID of the task being tracked.
+        :param Callable reporter: Called as reporter(progress, socket, task_id) to send a report.
+        """
+        self.socket, self.task_id, self.reporter = socket, task_id, reporter
+        self.progress = 0
+        self.last_report_time = 0
+
+    def sending_report(self):
+        """Tell whether a report is due, and restart the two-second timer if it is."""
+        if time.time() - self.last_report_time <= 2:
+            return False
+        self.last_report_time = time.time()
+        return True
+
+    def _publish(self, progress):
+        """Store the new progress value and report it when a report is due."""
+        self.progress = progress
+        if self.sending_report():
+            self.reporter(self.progress, self.socket, self.task_id)
+
+    def callback(self, pipe, step_index, _tensor, tensor_callback):
+        """Step-end callback for a diffusion pipeline; returns ``tensor_callback`` unchanged."""
+        self._publish(step_index * 100 // pipe.num_timesteps)
+        return tensor_callback
+
+    def incomplete_callback(self, max_progress):
+        """
+        Emulates an "incomplete progress": when a task needs several diffusion models.
+
+        :param int max_progress: Max progress that can be added by this sub-task.
+        :return Callable: Function to pass to the diffusion model.
+        """
+        starting_point = self.progress
+
+        def local_faker(pipe, step_index, _tensor, tensor_callback):
+            """Step-end callback that scales the sub-task progress on top of the starting point."""
+            self._publish(starting_point + step_index * max_progress // pipe.num_timesteps)
+            return tensor_callback
+
+        return local_faker
diff --git a/server/utils.py b/server/utils.py
new file mode 100644
index 0000000..3c79435
--- /dev/null
+++ b/server/utils.py
@@ -0,0 +1,74 @@
+Various utility functions for the server.
+import os
+import io
+import json
+from PIL import Image
+def hex_to_bytes(hex_string):
+    """
+    Decode a dash-separated hexadecimal string into bytes.
+
+    :param str hex_string: Hex string in C# format, separated by dashes.
+    :return bytes: Bytes object
+    """
+    without_dashes = "".join(hex_string.split("-"))
+    return bytes.fromhex(without_dashes)
+def hex_to_pillow(hex_string):
+    """
+    Decode a hex string and open the resulting bytes as a Pillow image.
+
+    :param str hex_string: Hex string in C# format, separated by dashes.
+    :return PIL.Image.Image: Decoded Pillow image.
+    """
+    return Image.open(io.BytesIO(hex_to_bytes(hex_string)))
+def get_image_bytes(image):
+    """
+    Encode an image as PNG and return the resulting bytes.
+
+    :param PIL.Image.Image image: Input image to get bytes from.
+    :return bytes: Bytes object
+    """
+    with io.BytesIO() as png_buffer:
+        image.save(png_buffer, format="PNG")
+        return png_buffer.getvalue()
+def get_configuration_data():
+    """
+    Read the server configuration from ``../api.json``, relative to this module's directory.
+
+    :return dict: Server configuration data from a JSON file.
+    """
+    config_path = os.path.join(os.path.dirname(__file__), "../api.json")
+    with open(config_path, encoding="utf-8") as config_file:
+        return json.load(config_file)
+def get_server_address():
+    """
+    Return the IP and port the server should listen on, as set in the configuration.
+
+    :return tuple: (server_ip, server_port)
+    """
+    configuration = get_configuration_data()
+    return configuration["serverIp"], configuration["serverPort"]
+def image_response(image):
+    """
+    Build the standard image response: the PNG bytes of the image as a hexadecimal string.
+
+    :param PIL.Image.Image image: The image to return.
+    :return dict: Response data with the key 'imageHexBytes'.
+    """
+    return {"imageHexBytes": get_image_bytes(image).hex()}
diff --git a/skybox/diffusion.py b/skybox/diffusion.py
new file mode 100644
index 0000000..eb26947
--- /dev/null
+++ b/skybox/diffusion.py
@@ -0,0 +1,134 @@
+Simple(st) diffusion network,
+based on https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0.
+Generates an image from a prompt.
+import warnings
+from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+def show_images(images):
+    """
+    Display at most the first five images.
+
+    :param list[PIL.Image.Image] images: Images.
+    """
+    for image in images[:5]:
+        image.show()
+def is_power_of_two(n):
+    """Tell whether ``n`` is a positive integer with exactly one bit set."""
+    if n <= 0:
+        return False
+    # Clearing the lowest set bit leaves zero only for powers of two.
+    return not n & (n - 1)
+def get_image_generation_pipeline():
+ """
+ Load the SDXL base text-to-image pipeline from Hugging Face.
+ :return StableDiffusionXLPipeline: Pipeline as returned by ``from_pretrained`` (not moved to a device).
+ """
+ # Half-precision weights are requested through both the dtype and the "fp16" variant.
+ return StableDiffusionXLPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-base-1.0",
+ torch_dtype=torch.float16,
+ use_safetensors=True,
+ variant="fp16",
+ )
+def get_image_refinement_pipeline():
+ """
+ Load the SDXL refiner image-to-image pipeline from Hugging Face.
+ :return StableDiffusionXLImg2ImgPipeline: Pipeline as returned by ``from_pretrained`` (not moved to a device).
+ """
+ # Half-precision weights are requested through both the dtype and the "fp16" variant.
+ return StableDiffusionXLImg2ImgPipeline.from_pretrained(
+ "stabilityai/stable-diffusion-xl-refiner-1.0",
+ torch_dtype=torch.float16,
+ variant="fp16",
+ use_safetensors=True,
+ )
+def generate_images(prompt, num_inference_steps=50, height=1024, width=None, **pipe_kwargs):
+    """
+    Generate images from the given prompt with the SDXL base pipeline, on CUDA.
+
+    Note: for best results with SDXL, height * width should be equal to 1024*1024.
+
+    :param prompt: The prompt for the image.
+    :type prompt: str | tuple[str] | list[str]
+    :param int num_inference_steps: Number of denoising steps
+    :param int height: Image height, should be a power of two
+    :param int width: Image width, if left to None it will be equal to 1024*1024 // height
+    :param dict pipe_kwargs: Additional arguments to pass to the pipeline.
+    :return list[PIL.Image.Image]: Generated images
+    """
+    if width is None:
+        width = 1024 * 1024 // height
+    height_ok = is_power_of_two(height)
+    width_ok = is_power_of_two(width)
+    # At most one warning is raised about the dimensions.
+    if not (height_ok or width_ok):
+        warnings.warn(
+            f"Specified dimensions {width} * {height} are not powers of two, proceed with care."
+        )
+    elif not height_ok:
+        warnings.warn(
+            f"Specified image height {height} is not a power of two, you may run into issues."
+        )
+    elif not width_ok:
+        warnings.warn(
+            f"Specified image width {width} is not a power of two, you may run into issues."
+        )
+    if width * height != 1024 * 1024:
+        print(
+            "width * height should be equal to 1024 * 1024 for better results.",
+            f"Current is {width} * {height}."
+        )
+    pipeline = get_image_generation_pipeline().to("cuda")
+    # If more VRAM needed
+    # pipeline.enable_model_cpu_offload()
+    # If computation takes a long time (Linux only)
+    # pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
+    output = pipeline(
+        prompt=prompt,
+        num_inference_steps=num_inference_steps,
+        height=height,
+        width=width,
+        **pipe_kwargs
+    )
+    return output.images
+def refine_images(prompt, init_image, num_inference_steps=15, **pipe_kwargs):
+    """
+    Refine images with the SDXL refiner pipeline, on CUDA.
+
+    :param str | list[str] prompt: The prompt for the refined image.
+    :param PIL.Image.Image init_image: The initial image to refine.
+    :param int num_inference_steps: The number of inference steps for the refinement process.
+    :param dict pipe_kwargs: Additional keyword arguments to pass to the pipeline.
+    :return list[PIL.Image.Image]: A list of refined images.
+    """
+    refiner = get_image_refinement_pipeline().to("cuda")
+    output = refiner(
+        prompt, image=init_image, num_inference_steps=num_inference_steps, **pipe_kwargs
+    )
+    return output.images
+def main():
+    """Interactive demo: ask for a prompt, generate images, show them, refine them, show again."""
+    demand = input(
+        "What would you like to generate? (Empty: An astronaut riding a green horse) "
+    )
+    # An answer made only of whitespace counts as empty: strip() leaves "" in that case.
+    if not demand.strip():
+        demand = "An astronaut riding a green horse"
+    batch_size = 1
+    inference_steps = 50
+    results = generate_images(
+        [demand] * batch_size, num_inference_steps=inference_steps, height=512, width=2048,
+    )
+    show_images(results)
+    results = refine_images([demand] * batch_size, results, inference_steps)
+    show_images(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skybox/image_processing.py b/skybox/image_processing.py
new file mode 100644
index 0000000..1a2aaa8
--- /dev/null
+++ b/skybox/image_processing.py
@@ -0,0 +1,283 @@
+Various image editing functions.
+import itertools
+from PIL import Image, ImageDraw
+import numpy as np
+def split_base_image(img):
+    """
+    Cut an image at the middle of its width.
+
+    :param PIL.Image.Image img: Image to split (left untouched)
+    :return tuple: (left part, right part)
+    """
+    full_width, full_height = img.size
+    middle = full_width // 2
+    return (
+        img.crop((0, 0, middle, full_height)),
+        img.crop((middle, 0, full_width, full_height)),
+    )
+def flip_image_sides(img):
+    """
+    Split an image in the middle and mirror each half horizontally, in place.
+
+    :param PIL.Image.Image img: Base input image (won't be changed)
+    :return PIL.Image.Image: Image with the same dimension but parts flipped
+    """
+    left_half, right_half = split_base_image(img)
+    mirrored_left = left_half.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+    mirrored_right = right_half.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+    result = Image.new(img.mode, img.size)
+    result.paste(mirrored_left, (0, 0))
+    # The second half starts right where the first one ends.
+    result.paste(mirrored_right, (mirrored_left.width, 0))
+    return result
+def paste_borders(background, left_img, right_img):
+ """
+ Paste horizontally mirrored copies of two images onto a background, in place.
+ :param PIL.Image.Image background: Image to paste on (modified)
+ :param PIL.Image.Image left_img: Image mirrored, then pasted at ``top_left``
+ :param PIL.Image.Image right_img: Image mirrored, then pasted at ``top_right``
+ """
+ size = background.size
+ n_right_img = left_img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+ n_left_img = right_img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+ # Find the center of the background image
+ width, height = background.size
+ center = (width // 2, height // 2)
+ # NOTE(review): `size` is the background size itself, so both offsets below are the center
+ # minus/plus half the background; `top_right` lands at about (width, height) — confirm intent.
+ top_left = (center[0] - size[0] // 2, center[1] - size[1] // 2)
+ top_right = (center[0] + size[0] // 2, center[1] + size[1] // 2)
+ # Paste the images onto the background image
+ background.paste(n_right_img, top_left)
+ background.paste(n_left_img, top_right)
+def concatenate_borders(left_image, right_image):
+    """
+    Put two images side by side on a new RGB image, both aligned at the top.
+
+    :param PIL.Image.Image left_image: Image to concatenate on the left
+    :param PIL.Image.Image right_image: Image to concatenate on the right
+    :return PIL.Image.Image: New image, as wide as both inputs and as tall as the tallest
+    """
+    left_width, left_height = left_image.size
+    right_width, right_height = right_image.size
+    canvas_size = (left_width + right_width, max(left_height, right_height))
+    canvas = Image.new("RGB", canvas_size)
+    # The right image starts where the left one ends.
+    for part, x_offset in ((left_image, 0), (right_image, left_width)):
+        canvas.paste(part, (x_offset, 0))
+    return canvas
+def horizontal_carrousel(base_image, left_translation):
+    """
+    Cut the image at a horizontal position and swap both parts, like a carrousel.
+
+    The part on the right of the cut is moved to the left edge, and the part on the left of
+    the cut follows it, like in Pacman.
+
+    :param PIL.Image.Image base_image: Image to carrousel.
+    :param int left_translation: Horizontal position of the cut, in pixels.
+        A negative value is counted from the right edge.
+    :return PIL.Image.Image: New image translated.
+    """
+    cut = left_translation if left_translation >= 0 else base_image.width + left_translation
+    before_cut = base_image.crop((0, 0, cut, base_image.height))
+    after_cut = base_image.crop((cut, 0, base_image.width, base_image.height))
+    rolled = base_image.copy()
+    rolled.paste(after_cut)
+    rolled.paste(before_cut, (base_image.width - cut, 0))
+    return rolled
+def box_mean_color(img, box):
+    """
+    Average the pixel values found inside a box of an image.
+
+    :param PIL.Image.Image img: Image to take pixels from
+    :param box: Box delimitation in format (left, top, right, bottom)
+    :type box: tuple[int, int, int, int]
+    :return tuple[int, int, int]: mean pixel color, each channel truncated to int
+    """
+    span_x = box[2] - box[0]
+    span_y = box[3] - box[1]
+    pixel_access = img.crop(box).load()
+    samples = [pixel_access[x, y] for x in range(span_x) for y in range(span_y)]
+    return tuple(int(channel) for channel in np.mean(samples, axis=0))
+def draw_gradient_box(img, position, size, start_color, end_color):
+    """
+    Draw a box filled with a horizontal gradient, one vertical line per column.
+
+    :param PIL.Image.Image img: Base image to draw rectangle on
+    :param tuple[int, int] position: top-left corner where to start the box
+    :param tuple[int, int] size: Size of the box to draw
+    :param tuple[int, int, int] start_color: Color at the beginning of the box
+    :param tuple[int, int, int] end_color: Color at the end of the box
+    """
+    pen = ImageDraw.Draw(img)
+    left, top = position
+    box_width, box_height = size
+    for column in range(box_width):
+        # Linear interpolation between both colors, channel by channel.
+        shade = tuple(
+            int(start_color[channel] + (end_color[channel] - start_color[channel]) * column / box_width)
+            for channel in range(3)
+        )
+        pen.line([(left + column, top), (left + column, top + box_height)], shade)
+# 2D polar geometry functions
+def cartesian_to_polar(pos, origin=(0, 0)):
+    """
+    Convert a cartesian position to polar coordinates.
+
+    :param tuple[float, float] pos: (x, y) position in cartesian coordinates
+    :param tuple[float, float] origin: Point the polar coordinates are relative to
+    :return tuple[float, float]: Radius and angle (radians, as given by arctan2)
+    """
+    offset = np.array(pos) - origin
+    radius = np.linalg.norm(offset)
+    angle = np.arctan2(offset[1], offset[0])
+    return radius, angle
+def cartesian_to_polar_batch(pos, origin=(0, 0)):
+    """
+    Convert a batch of cartesian positions to polar coordinates.
+
+    :param numpy.ndarray | tuple | list pos: (x, y) positions in cartesian coordinates
+    :param tuple[float, float] | float origin: Point the polar coordinates are relative to
+    :return numpy.ndarray: One (radius, angle) row per input position
+    """
+    offsets = np.array(pos) - origin
+    radii = np.linalg.norm(offsets, axis=1)
+    angles = np.arctan2(offsets[:, 1], offsets[:, 0])
+    return np.column_stack((radii, angles))
+def polar_to_cartesian(pos, origin=(0, 0)):
+    """
+    Convert a polar position to cartesian coordinates.
+
+    :param tuple[float, float] pos: (radius, angle) position in polar coordinates
+    :param tuple[float, float] origin: Cartesian axis origin
+    :return numpy.ndarray: Position as (x, y)
+    """
+    radius, angle = pos[0], pos[1]
+    direction = np.array([np.cos(angle), np.sin(angle)])
+    return origin + radius * direction
+# Image distortion
+def distort_image(img, inner_radius=None):
+ """
+ Create an image distorted to fit on a circle.
+ With an initial image of dimensions (width, height),
+ the new image has dimensions ((height + inner_radius) * 2, (height + inner_radius) * 2).
+ Modified pixels are those whose distance to the canvas center is at least inner_radius
+ and less than half the canvas size; every other pixel stays black.
+ :param PIL.Image.Image img: Base image.
+ :param int | None inner_radius: Radius of a central disc left unpainted (optional)
+ :return PIL.Image.Image: New image in a circle.
+ """
+ # New image dimensions
+ canvas_size = img.height * 2 + (inner_radius or 0) * 2
+ # Get the polar coordinate of each pixel in the new image, format [[radius, angle], ...]
+ # Max radius is sqrt(2) * canvas_size / 2 (corners)
+ grid = tuple(itertools.product(range(canvas_size), range(canvas_size)))
+ polar_coordinates = cartesian_to_polar_batch(grid, canvas_size / 2)
+ # Select the indices of the pixels that should be changed in the new image
+ insiders = np.nonzero(
+ np.logical_and(
+ # Inside painted circle
+ polar_coordinates[:, 0] < canvas_size / 2,
+ # And outside mask
+ polar_coordinates[:, 0] >= (inner_radius or 0),
+ )
+ )[0]
+ # Acquire position on base image, cast to [0, img.size - 1]:
+ # the radius is scaled to a row index and the angle to a column index.
+ adapted_pos = (
+ polar_coordinates[insiders]
+ * (np.array(img.size[::-1]) - 1)
+ / (canvas_size / 2, 2 * np.pi)
+ )
+ # Shift the columns by half the width; presumably because arctan2 angles are centered on 0 — confirm.
+ adapted_pos[:, 1] += (img.width - 1) / 2
+ # Round to int; column 0 indexes rows and column 1 indexes columns of the source array
+ slicer = np.round(adapted_pos).astype(np.uint16)
+ new_pixels = np.asarray(img)[slicer[:, 0], slicer[:, 1]]
+ # NOTE(review): the output buffer has 3 columns, so this presumably expects a 3-channel image — confirm.
+ new_img_data = np.zeros((canvas_size * canvas_size, 3), dtype=np.uint8)
+ # Assign to the new image
+ new_img_data[insiders] = new_pixels
+ return Image.fromarray(new_img_data.reshape(canvas_size, canvas_size, 3))
+def unroll_top_image(img, width=None):
+    """
+    Unroll a polar projected image to standard format.
+
+    Each column of the result is an angle around the center of the input image, and each
+    row a distance from that center.
+
+    :param PIL.Image.Image img: Image fitting in a circle.
+    :param int | None width: Width of the result, half the input width if left to None.
+    :return PIL.Image.Image: Unrolled RGB image, half as tall as the input.
+    """
+    unrolled_width = width or img.size[0] // 2
+    unrolled_height = img.size[1] // 2
+    # Pixels start black and are all overwritten below
+    unrolled = Image.new("RGB", (unrolled_width, unrolled_height), color="black")
+    center = (img.width / 2, img.height / 2)
+    for column in range(unrolled.width):
+        for row in range(unrolled.height):
+            radius = row * img.height / 2 / unrolled.height
+            angle = column * 2 * np.pi / unrolled.width
+            # Position in the base image space, truncated to pixel indices
+            source = polar_to_cartesian((radius, angle), center)
+            source_pixel = int(source[0]), int(source[1])
+            unrolled.putpixel((column, row), img.getpixel(source_pixel))
+    return unrolled
+def image_polar_to_rect(img, width=None):
+ """
+ Take a polar (fisheye) image, and display it on a rectangle.
+ The result is only shown with ``show()``; nothing is returned.
+ :param PIL.Image.Image img: Input image.
+ :param int | None width: Width of the new image, half the input height if left to None.
+ """
+ base_radius = img.size[1] // 2
+ rect_size = (base_radius if width is None else width), base_radius
+ new_img = Image.new("RGB", rect_size, "white")
+ for radius in np.linspace(0, base_radius, base_radius - 1, endpoint=False):
+ for theta in np.linspace(0, 2 * np.pi, img.size[0], endpoint=False):
+ # NOTE(review): this position is centered on (base_radius, base_radius) and can exceed
+ # rect_size, which is only base_radius tall — confirm the canvas size.
+ canvas_pos = (
+ int(base_radius + radius * np.cos(theta)),
+ int(base_radius + radius * np.sin(theta)),
+ )
+ # NOTE(review): the source is read at (angle, radius) and written at a cartesian position,
+ # which looks like a rectangle-to-polar mapping rather than the reverse — confirm.
+ new_img.putpixel(
+ canvas_pos,
+ img.getpixel(
+ (int(theta / (2 * np.pi) * (img.size[0] - 1)), int(radius))
+ ),
+ )
+ new_img.show()
diff --git a/skybox/inpainting.py b/skybox/inpainting.py
new file mode 100644
index 0000000..55c4301
--- /dev/null
+++ b/skybox/inpainting.py
@@ -0,0 +1,319 @@
+Image inpainting.
+For general usage, see https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint
+First version stable diffusion 1.2 base:
+Second version stable diffusion 2 base:
+Third version stable diffusion xl 1.0:
+import enum
+import warnings
+from diffusers import StableDiffusionXLInpaintPipeline
+from PIL import Image, ImageFilter
+import numpy as np
+import torch
+from skimage.restoration.inpaint import inpaint_biharmonic
+from skybox import image_processing
class InpaintingFilling(enum.Enum):
    """
    Inpainting filling masks.

    SAME: do not edit the input image.
    AVERAGE: replace the masked area with the average pixel value.
    MEAN_GREY: replace the masked area with a uniform grey mask.
    BIHARMONIC: bi-harmonic interpolation,
        see skimage.restoration.inpaint_biharmonic for more details
    RANDOM: replace the masked area with random noise.
    """

    SAME = 1
    # The three members below are documented above and used by
    # fill_masked_area, so they must exist on the enum.
    AVERAGE = 2
    MEAN_GREY = 3
    BIHARMONIC = 4
    RANDOM = 5
def make_transparent_black(image):
    """
    From an RGBA image, make the transparent pixels black.

    The per-pixel ``getpixel``/``putpixel`` loop is replaced by a single
    vectorised numpy operation, which gives the same result much faster.

    :param PIL.Image.Image image: Base RGBA image
    :return PIL.Image.Image: Mask image in grayscale format (L).
    """
    # np.array (not asarray) so that the buffer is writable
    grayscale = np.array(image.convert("L"))
    # The last band is the alpha channel of an RGBA image
    alpha = np.asarray(image.split()[-1])
    # Anything that is not fully opaque becomes black
    grayscale[alpha < 255] = 0
    return Image.fromarray(grayscale)
def center_on_mask(mask_image):
    """
    Translate an image horizontally so that the mask is centered.

    :param PIL.Image.Image mask_image: Mask image where to find the mean point.
    :return: How many pixels should be translated, and if the mask goes across the image.
    :rtype: tuple[int, bool]
    :raises ValueError: If the mask has no non-zero pixel.
    """
    mask_columns = np.asarray(mask_image).nonzero()[1]
    if mask_columns.size == 0:
        # Previously surfaced as an obscure "cannot convert float NaN" error
        raise ValueError("The mask is empty: there is nothing to center on.")
    last_column = mask_image.width - 1
    # The mask does not touch both borders: no wrap-around to handle
    if mask_columns.min() > 0 or mask_columns.max() < last_column:
        return int(np.mean(mask_columns)), False
    # The mask touches both borders: measure it again after a half turn
    half_turn_mask = image_processing.horizontal_carrousel(
        mask_image, mask_image.width // 2
    )
    mask_columns = np.asarray(half_turn_mask).nonzero()[1]
    if mask_columns.min() == 0 and mask_columns.max() == last_column:
        # Still touching both borders after the half turn
        warnings.warn("Seems like the mask is too large!", stacklevel=2)
    return mask_image.width - int(np.mean(mask_columns)), True
def fill_masked_area(image, mask, inpainting_filling=InpaintingFilling.SAME):
    """
    Fill a masked area of the given image with a specific strategy.

    :param PIL.Image.Image image: Image to fill.
    :param PIL.Image.Image mask: Mask of the area to fill (non-zero means "fill").
    :param InpaintingFilling inpainting_filling: Filling strategy.
    :return PIL.Image.Image: ``image`` itself for ``SAME``, a new image otherwise.
    :raises ValueError: If the strategy is not supported.
    """
    if inpainting_filling == InpaintingFilling.SAME:
        return image
    if inpainting_filling == InpaintingFilling.AVERAGE:
        # Use the average value of the pixels located under the mask
        pixels = np.asarray(image)[np.asarray(mask) != 0]
        masked_image = image.copy()
        if pixels.size == 0:
            # Nothing is masked: avoid a NaN mean, there is nothing to paste
            return masked_image
        mean_pixel = np.mean(pixels, axis=0).astype(np.uint8)
        # Plain ints: Pillow colours should not be numpy scalars
        color = tuple(int(channel) for channel in mean_pixel)
        area = Image.new(image.mode, image.size, color=color)
        masked_image.paste(area, mask=mask)
        return masked_image
    if inpainting_filling == InpaintingFilling.MEAN_GREY:
        # Equalize with grey
        grey_area = Image.new(image.mode, image.size, color="grey")
        masked_image = image.copy()
        masked_image.paste(grey_area, mask=mask)
        return masked_image
    if inpainting_filling == InpaintingFilling.BIHARMONIC:
        # Bi-harmonic filling
        image_data = np.asarray(image)
        mask_data = np.asarray(mask)
        inpainted = inpaint_biharmonic(image_data, mask_data, channel_axis=-1) * 255
        # Round and clip before the cast: truncation shifts untouched pixels by
        # one level and out-of-range values would wrap around in uint8
        inpainted = np.clip(np.rint(inpainted), 0, 255).astype(np.uint8)
        return Image.fromarray(inpainted)
    if inpainting_filling == InpaintingFilling.RANDOM:
        # Adds only random values. The noise is not multiplied by the mask
        # anymore: with 0/255 masks the product overflowed uint8, and the paste
        # below already restricts the noise to the masked area.
        rng = np.random.default_rng(1)
        # Upper bound is exclusive: 256 so that 255 can be drawn too
        noise_data = rng.integers(0, 256, np.asarray(image).shape, dtype=np.uint8)
        masked_image = image.copy()
        masked_image.paste(Image.fromarray(noise_data), mask=mask)
        return masked_image
    raise ValueError(f"Unsupported inpainting filling: {inpainting_filling!r}")
def get_inpainting_pipeline():
    """
    Build the Stable Diffusion XL inpainting pipeline.

    The weights are fetched with ``from_pretrained`` from the
    ``diffusers/stable-diffusion-xl-1.0-inpainting-0.1`` repository, using the
    ``fp16`` variant and ``torch.float16`` as data type.

    :return: A pre-trained Stable Diffusion XL inpainting pipeline.
    """
    model_id = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
    pipeline = StableDiffusionXLInpaintPipeline.from_pretrained(
        model_id, torch_dtype=torch.float16, variant="fp16"
    )
    return pipeline
def inpaint_image(prompts, image, mask_image, negative_prompt=None, **pipe_kwargs):
    """
    Apply the prompt to do the inpainting.

    Side effect: reduce the quality of the image, even outside the mask.

    ``height``, ``width`` and ``strength`` default to the image size and 0.9,
    but can now be overridden through ``pipe_kwargs``; passing any of them
    used to raise a "multiple values for keyword argument" ``TypeError``.

    :param str or list[str] prompts: Prompts to use
    :param PIL.Image.Image image: Base image
    :param mask_image: Mask to apply. The mask is white for inpainting and black for keeping as is.
    :type mask_image: PIL.Image.Image
    :param str negative_prompt: Negative prompt to apply
    :param pipe_kwargs: Extra keyword arguments forwarded to the pipeline call.
    :return list[PIL.Image.Image]: Inpainted images
    """
    call_kwargs = {"height": image.height, "width": image.width, "strength": 0.9}
    # Caller-provided values take precedence over the defaults above
    call_kwargs.update(pipe_kwargs)
    pipe = get_inpainting_pipeline().to("cuda")
    return pipe(
        prompt=prompts,
        image=image,
        mask_image=mask_image,
        negative_prompt=negative_prompt,
        **call_kwargs,
    ).images
def force_inpainting(prompts, image, mask_image, negative_prompt=None, **pipe_kwargs):
    """
    Inpaint with settings that make sure the masked area really gets repainted.

    The masked area is first replaced by random noise, so the generation does
    not start from the original content and gives more random results.
    ``guidance_scale`` and ``num_inference_steps`` are always forced to 20 and 25.

    Side effect: reduce the quality of the image, even outside the mask.

    :param str or list[str] prompts: Prompts to use
    :param PIL.Image.Image image: Base image
    :param mask_image: Mask to apply. The mask is white for inpainting and black for keeping as is.
    :type mask_image: PIL.Image.Image
    :param str negative_prompt: Negative prompt to apply
    :return list[PIL.Image.Image]: Inpainted images
    """
    forced_settings = {**pipe_kwargs, "guidance_scale": 20, "num_inference_steps": 25}
    noisy_image = fill_masked_area(image, mask_image, InpaintingFilling.RANDOM)
    return inpaint_image(
        prompts, noisy_image, mask_image, negative_prompt, **forced_settings
    )
def image_compositing(
    initial_image, inpainted_image, mask_image, blurring_radius, horizontal_tiling=False
):
    """
    Preserve the quality of the original image by blending the original and the inpainted images.

    The mask is blurred so that the transition between both images is smooth.
    Compositing is a per-pixel operation, so only the blurred mask needs the
    neighbouring tiles: with ``horizontal_tiling`` the mask is repeated three
    times and the middle tile is kept, which makes the blur wrap around on
    BOTH vertical borders. The previous two-tile frame left the right border
    blurred against the frame edge instead of the wrapped content.

    The mask is always converted to "L" first, as both branches now do.

    :param PIL.Image.Image initial_image: Initial image before any inpainting.
    :param PIL.Image.Image inpainted_image: Image after inpainting process.
    :param PIL.Image.Image mask_image: Mask image to define the area to be inpainted.
    :param int blurring_radius: Radius of the blurring filter applied to the mask.
    :param bool horizontal_tiling: If True, we apply a horizontal tiling before compositing.
    :return PIL.Image.Image final_composition: Composited image with original and inpainted parts.
    """
    blur = ImageFilter.BoxBlur(blurring_radius)
    grey_mask = mask_image.convert("L")
    if horizontal_tiling:
        width, height = grey_mask.size
        tiled_mask = Image.new("L", (width * 3, height))
        for left_padding in range(0, width * 3, width):
            tiled_mask.paste(grey_mask, (left_padding, 0))
        # Keep the middle tile: it has a real neighbour on each side
        blurred_mask = tiled_mask.filter(blur).crop((width, 0, width * 2, height))
    else:
        blurred_mask = grey_mask.filter(blur)
    final_composition = Image.composite(inpainted_image, initial_image, blurred_mask)
    return final_composition
def inpaint_panorama_pipeline(
    init_image, mask_image, prompt, step_callback=None, blurring_radius=40
):
    """
    Run a full inpainting pass on a panorama.

    When the mask crosses the vertical borders, image and mask are rotated
    horizontally before the inpainting and the result is rotated back.
    The outcome is finally composited with the initial image, with tiling.

    :param PIL.Image.Image init_image: Initial image to inpaint
    :param PIL.Image.Image mask_image: Mask image to use
    :param str prompt: Prompt for inpainting
    :param step_callback: Function to run at the end of each step f : step_number -> Any
    :type step_callback: Callable | None
    :param int blurring_radius: Size of the blurring radius to apply.
    :return PIL.Image.Image: The new inpainted image
    """
    shift, wraps_around = center_on_mask(mask_image)
    if wraps_around:
        # The mask is across the borders: "turn" the image and the mask
        source_image = image_processing.horizontal_carrousel(init_image, shift)
        source_mask = image_processing.horizontal_carrousel(mask_image, shift)
    else:
        source_image, source_mask = init_image, mask_image
    inpainted = force_inpainting(
        prompt, source_image, source_mask, callback_on_step_end=step_callback
    )[0]
    if wraps_around:
        # Undo the rotation applied before the inpainting
        inpainted = image_processing.horizontal_carrousel(inpainted, -shift)
    # Keep the inpainted pixels on the mask only, to avoid a quality decrease
    return image_compositing(init_image, inpainted, mask_image, blurring_radius, True)
def _ask_with_default(question, default):
    """Ask ``question`` on stdin and return ``default`` for an empty or blank answer."""
    answer = input(f"{question} [{default}] ")
    # ``answer.strip().isspace()`` can never be True (a stripped string has no
    # surrounding whitespace and "" is not a space), so test emptiness instead
    return answer if answer.strip() else default


def inpainting_demo():
    """
    A demo interaction of what the model can do.

    This function demonstrates the usage of the model by prompting the user for a replacement,
    to be added in the input image.
    If the user gives an empty or whitespace-only answer, the default shown
    between brackets is used.
    """
    prompt = _ask_with_default(
        "What replacement do you want?", "A cat, high resolution, sitting"
    )
    image_path = _ask_with_default("What is the image path?", "../sunny_mountain.png")
    base_image = Image.open(image_path).convert("RGB")
    mask_path = _ask_with_default("What is the mask path?", "mask.png")
    mask_image = Image.open(mask_path)
    print("Starting inpainting")
    inpainted_images = inpaint_image([prompt] * 4, base_image, mask_image)
    for im in inpainted_images:
        im.show()
    print("Restoring initial image quality.")
    for im in inpainted_images:
        image_compositing(base_image, im, mask_image, 5, True).show()
def __regenerate_mask():
    """
    Rebuild the demo mask file ``mask.png`` from ``../sunny_mountain.png``.

    The mask has the size of the image, is black everywhere except for a
    100x100 white square whose top-left corner is the image center.

    :return PIL.Image.Image: The generated mask (mode "L").
    """
    with Image.open("../sunny_mountain.png") as image:
        width, height = image.size
    # Arrays are indexed (row, column), i.e. (height, width): building the
    # array from ``image.size`` directly produced a transposed mask
    mask = np.zeros((height, width), dtype=np.uint8)
    top, left = height // 2, width // 2
    # White (255) square marking the area to inpaint
    mask[top : top + 100, left : left + 100] = 255
    # Save the mask as a PNG file using Pillow
    img = Image.fromarray(mask)
    img.save("mask.png")
    return img
+if __name__ == "__main__":
+ inpainting_demo()
diff --git a/skybox/legacy/diffusion_trainer.py b/skybox/legacy/diffusion_trainer.py
new file mode 100644
index 0000000..2db59a5
--- /dev/null
+++ b/skybox/legacy/diffusion_trainer.py
@@ -0,0 +1,40 @@
+Training pipeline for a diffusion network.
+import random
+from skybox.diffusion import generate_images
+from skybox.legacy.equirectangular_checker import score_image
def random_sentence():
    """Build a random four-word sentence: adjective, noun, verb, adverb."""
    word_sets = (
        ("quick", "lazy", "smart", "cute", "red"),  # adjectives
        ("dog", "cat", "bird", "apple", "car"),  # nouns
        ("runs", "eats", "hops", "jumps", "drives"),  # verbs
        ("quickly", "slowly", "carefully", "loudly", "eagerly"),  # adverbs
    )
    # One word is drawn from each set, in order
    return " ".join(random.choice(words) for words in word_sets)
def generate():
    """Generate, display and return one image from a random prompt."""
    prompt = f"{random_sentence()} monoscopic 360 equirectangular"
    print(prompt)
    images = generate_images(prompt)
    first_image = images[0]
    first_image.show()
    return first_image
def evaluate(img):
    """Score the quality of the given image, as computed by ``score_image``."""
    score = score_image(img)
    return score
+if __name__ == "__main__":
+ for _ in range(5):
+ score = evaluate(generate())
+ print(f"Borders variation: {score}")
diff --git a/skybox/legacy/equirectangular_checker.py b/skybox/legacy/equirectangular_checker.py
new file mode 100644
index 0000000..00aa5f9
--- /dev/null
+++ b/skybox/legacy/equirectangular_checker.py
@@ -0,0 +1,80 @@
+Evaluates how much an image deviates from an equirectangular projection.
+import sys
+from PIL import Image
def check_ratio(img):
    """
    Check if the image's aspect ratio is at least 2:1.

    :param PIL.Image.Image img: Image to check (only ``img.size`` is read).
    :return bool: True if the width is at least twice the height.
        An image without height has no aspect ratio and gives False
        (it used to raise ``ZeroDivisionError``).
    """
    width, height = img.size
    # Integer comparison: no division, hence no division by zero
    return height > 0 and width >= 2 * height
def define_boxes(img_size, subdivisions, pixels):
    """
    Yield pairs of facing boxes along the borders of an image.

    The top/bottom pairs come first, then the left/right pairs. Each border
    is cut in ``subdivisions`` boxes which are ``pixels`` thick.

    :param tuple[int, int] img_size: Image size (width, height).
    :param int subdivisions: Number of boxes per border.
    :param int pixels: Thickness of the boxes.
    :return: Generator of (box, opposite box), as (left, upper, right, lower) tuples.
    """
    width, height = img_size
    # Top border against bottom border
    for index in range(subdivisions):
        left = index * width // subdivisions
        right = (index + 1) * width // subdivisions
        yield (left, 0, right, pixels), (left, height - pixels, right, height)
    # Left border against right border
    for index in range(subdivisions):
        upper = index * height // subdivisions
        lower = (index + 1) * height // subdivisions
        yield (0, upper, pixels, lower), (width - pixels, upper, width, lower)
def check_boundaries(img, subdivisions=10, pixels=5):
    """
    Check if the boundaries of an image can be matched.

    The borders are cut in boxes (see ``define_boxes``); the mean grey level
    of each box is compared with the one of the facing box.

    Fixes over the previous version: the image is converted to grey levels
    (multi-band pixels cannot be summed), each mean is normalised by the real
    box area rather than ``pixels ** 2``, and the result is averaged over the
    number of compared pairs (there are ``2 * subdivisions`` of them).

    :param PIL.Image.Image img: Image to check.
    :param int subdivisions: Number of boxes per border.
    :param int pixels: Thickness of the boxes.
    :return float: Mean squared difference of the facing boxes, between 0 and 1.
    """
    grey = img if img.mode == "L" else img.convert("L")
    total = 0.0
    pair_count = 0
    for box, partner in define_boxes(grey.size, subdivisions, pixels):
        means = []
        for region_box in (box, partner):
            region = grey.crop(region_box)
            area = region.width * region.height
            # Mean level scaled to [0, 1]; an empty box counts as black
            means.append(sum(region.getdata()) / 255 / area if area else 0.0)
        total += (means[1] - means[0]) ** 2
        pair_count += 1
    # Method 2: diff = sum((regions[1].mirror - regions[0]) ** 2)
    return total / pair_count if pair_count else 0.0
def check_frontiers(img):
    """
    Return how different are the pixels on the opposite borders of an image.

    The left column is compared with the right one, and the top row with the
    bottom one. The loop bounds used to be swapped (rows iterated over the
    width and columns over the height), which raised ``IndexError`` on any
    image wider than tall.

    :param PIL.Image.Image img: Input image. Converted to grey levels ("L") if needed.
    :return: Frontiers difference. 0 for identical, 1 if they are totally different.
    :rtype: float
    """
    if img.mode != "L":
        # Pixel values must be plain numbers to be subtracted
        img = img.convert("L")
    width, height = img.size
    if width == 0 or height == 0:
        # No border to compare
        return 0.0
    diff = 0.0
    # Left border against right border, one row at a time
    for y in range(height):
        left, right = img.getpixel((0, y)), img.getpixel((width - 1, y))
        diff += ((right - left) / 255) ** 2
    # Top border against bottom border, one column at a time
    for x in range(width):
        top, bottom = img.getpixel((x, 0)), img.getpixel((x, height - 1))
        diff += ((bottom - top) / 255) ** 2
    return diff / (width + height)
def score_image(img):
    """
    Score how far the input image is from being equirectangular.

    The score is the borders variation given by ``check_frontiers`` on the
    grey-level version of the image.
    """
    greyscale = img.convert("L")
    return check_frontiers(greyscale)
def score_file(image_path):
    """
    Assigns a variation score to the image.

    Both border metrics are also printed. They work on single-band pixel
    values, so the grey-level image is handed to them; passing the image as
    opened failed on colour files.

    :param image_path: Path of the image file to score.
    :return: The variation score given by ``score_image``.
    """
    # The context manager releases the file handle once the pixels are loaded
    with Image.open(image_path) as img:
        grey = img.convert("L")
    variation = score_image(grey)
    print("Boundary", check_boundaries(grey), "frontier", check_frontiers(grey))
    return variation
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("Please provide an image.")
+ sys.exit(126)
+ print("Value :", score_file(sys.argv[1]))
diff --git a/skybox/legacy/sdxl.py b/skybox/legacy/sdxl.py
new file mode 100644
index 0000000..ed2306a
--- /dev/null
+++ b/skybox/legacy/sdxl.py
@@ -0,0 +1,44 @@
+An implementation of Stable Diffusion XL with a custom checkpoint by ByteDance.
+From https://huggingface.co/ByteDance/SDXL-Lightning.
+import torch
+from diffusers import (
+ StableDiffusionXLPipeline,
+ UNet2DConditionModel,
+ EulerDiscreteScheduler,
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
def main():
    """Demo: generate and show one image with the 4-step SDXL-Lightning UNet."""
    base_model = "stabilityai/stable-diffusion-xl-base-1.0"
    lightning_repo = "ByteDance/SDXL-Lightning"
    # Use the correct ckpt for your step setting!
    checkpoint_name = "sdxl_lightning_4step_unet.safetensors"
    # Load model.
    unet = UNet2DConditionModel.from_config(base_model, subfolder="unet")
    unet = unet.to("cuda", torch.float16)
    checkpoint_path = hf_hub_download(lightning_repo, checkpoint_name)
    unet.load_state_dict(load_file(checkpoint_path, device="cuda"))
    pipe = StableDiffusionXLPipeline.from_pretrained(
        base_model, unet=unet, torch_dtype=torch.float16, variant="fp16"
    )
    pipe = pipe.to("cuda")
    # Ensure sampler uses "trailing" time steps.
    pipe.scheduler = EulerDiscreteScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )
    # Ensure using the same inference steps as the loaded model and CFG set to 0.
    prompt = "A lazy cat jumping smiling 360 equirectangular monoscopic"
    result = pipe(prompt, num_inference_steps=4, guidance_scale=0)
    result.images[0].show()
+if __name__ == "__main__":
+ main()
diff --git a/skybox/mask.png b/skybox/mask.png
new file mode 100644
index 0000000..ec634e5
--- /dev/null
+++ b/skybox/mask_editor.py
@@ -0,0 +1,343 @@
+Create mask for an image so that it can be locally inpainted.
+import enum
+import numpy as np
+from PIL import Image, ImageDraw
+from sklearn.cluster import KMeans
+from skybox import image_processing
# Layout of the working canvas. The entries below had lost their enclosing
# ``FRAME_CONFIG = {`` ... ``}``, while the functions of this module read
# the configuration under that name.
FRAME_CONFIG = {
    "width": 2048 + 256,
    "height": 1024,
    # Initial image to map
    "base_image": {"width": 2048, "height": 1024},
    # Extension of the image to make a cylinder
    "horizontal_extensions": {"width": 256, "display": "both"},
    # Sky extension to make a hemisphere
    "top_extension": {"radius": 1024},
}
class ExtensionFilling(enum.Enum):
    """
    What type of filling to apply to an extended image.

    STRETCH: stretch the image border.
    MEAN: use the mean value of the image border.
    GRADIENT: apply a continuous gradient from the mean value of the image border.
    SMART: auto-extension depending on context.
    """

    # STRETCH and GRADIENT are documented above and used by add_top_frame,
    # so they must exist on the enum.
    STRETCH = 1
    MEAN = 2
    GRADIENT = 3
    SMART = 4
+# Horizontal panorama zone
def central_vertical_mask(image, center_width):
    """
    Build a mask made of a white vertical band centered on a black background.

    :param PIL.Image.Image image: Input image, giving the size of the mask.
    :param int center_width: Width of the vertical band.
    :return PIL.Image.Image: The mask, in "L" mode.
    """
    band_left = image.width // 2 - center_width // 2
    white_band = Image.new("L", (center_width, image.height), "white")
    mask = Image.new("L", image.size, "black")
    mask.paste(white_band, (band_left, 0))
    return mask
def central_circular_mask(canvas_size, inner_radius=None):
    """
    Create a square mask with a white disc of the given radius in its center.

    :param int canvas_size: The width or height of the output square image.
    :param inner_radius: The radius of the white disc. If None, nothing is drawn
        and the mask is entirely black.
    :type inner_radius: int | None
    :return PIL.Image.Image: A black square image, with a white disc in the center
        when a radius is given.
    """
    # NOTE(review): the docstring used to say that None creates "a full circle",
    # which the code never did - confirm which behavior is expected.
    mask = Image.new("L", (canvas_size, canvas_size), color="black")
    if inner_radius is None:
        return mask
    middle = canvas_size / 2
    bounding_box = (
        middle - inner_radius,
        middle - inner_radius,
        middle + inner_radius,
        middle + inner_radius,
    )
    ImageDraw.Draw(mask).ellipse(bounding_box, fill="white")
    return mask
def add_top_mask(img, radius):
    """
    Extend an image upwards and build the mask of the added zone.

    The canvas is ``2 * radius`` pixels taller than the image; the image is
    pasted at the bottom of it, below a white zone.

    :return: The extended image and its mask (white on the added zone,
        black on the original image).
    """
    width, height = img.size
    top_margin = radius * 2
    canvas_size = (width, height + top_margin)
    extended = Image.new("RGB", canvas_size, "white")
    extended.paste(img, (0, top_margin))
    mask = Image.new("L", canvas_size, "white")
    # The black patch is larger than needed: the paste clips it to the canvas
    black_patch = Image.new("L", canvas_size, color="black")
    mask.paste(black_patch, (0, top_margin))
    return extended, mask
def add_center_image(background, box_size):
    """
    Paste a white square in the center of ``background`` (edited in place).

    :return PIL.Image.Image: The white square that was pasted.
    """
    white_square = Image.new("RGB", (box_size, box_size), "white")
    bg_width, bg_height = background.size
    # Top-left corner putting the square in the middle of the background
    left = bg_width // 2 - box_size // 2
    top = bg_height // 2 - box_size // 2
    background.paste(white_square, (left, top))
    return white_square
def draw_top_image(background, radius, base_height):
    """
    Draw a white disc on ``background`` (edited in place).

    The disc is horizontally centered; its center lies ``base_height // 2 + radius``
    pixels above the center of the background.
    """
    middle_x = background.size[0] // 2
    middle_y = background.size[1] // 2
    disc_x = middle_x
    disc_y = middle_y - base_height // 2 - radius
    bounding_box = [disc_x - radius, disc_y - radius, disc_x + radius, disc_y + radius]
    ImageDraw.Draw(background).ellipse(bounding_box, fill="white")
+# Completion functions
def draw_masks(base_file):
    """
    Draw and display the inpainting layout built from a base image file.

    Everything is drawn on a black canvas whose size comes from FRAME_CONFIG:
    a central white square, the two halves of the base image as borders,
    and a white disc on top.
    """
    config = FRAME_CONFIG
    base_width = config["base_image"]["width"]
    base_height = config["base_image"]["height"]
    # Black canvas with the size of the whole frame
    canvas = Image.new("RGB", (config["width"], config["height"]), color="black")
    base_image = Image.open(base_file)
    # Shrink in place so that the base image fits in its reserved area
    base_image.thumbnail((base_width, base_height))
    left_half, right_half = image_processing.split_base_image(base_image)
    add_center_image(canvas, base_width)
    # Add borders
    image_processing.paste_borders(canvas, left_half, right_half)
    # Add a circle on top of the box
    draw_top_image(canvas, config["top_extension"]["radius"], base_height)
    canvas.show()
def horizontal_tiling_mask(img, frame_object=None):
    """
    Prepare an image and its mask so that an AI can complete the horizontal seam.

    :param PIL.Image.Image img: Image to apply masks to
    :param frame_object: Frame configuration object, containing information about
        the image dimensions and extensions. Defaults to FRAME_CONFIG.
    :type frame_object: dict
    :return: A tuple with the image whose sides were flipped, and a central
        vertical mask as wide as the frame minus the base image.
    """
    config = FRAME_CONFIG if frame_object is None else frame_object
    inpaint_canvas = image_processing.flip_image_sides(img)
    band_width = config["width"] - config["base_image"]["width"]
    return inpaint_canvas, central_vertical_mask(img, band_width)
def create_gradient_mask(width, height, is_horizontal=True):
    """
    Create a black-to-white gradient mask of specified dimensions.

    :param int width: Width of the mask
    :param int height: Height of the mask
    :param bool is_horizontal: If True, the gradient goes from left to right.
        Otherwise, it goes from top to bottom.
    :return: A mask image ("L" mode) with a gradient fill.
    """
    mask = Image.new("L", (width, height))
    pen = ImageDraw.Draw(mask)
    steps = width if is_horizontal else height
    for step in range(steps):
        shade = int(255 * (step / steps))
        if is_horizontal:
            # One vertical line per column
            segment = [(step, 0), (step, height)]
        else:
            # One horizontal line per row
            segment = [(0, step), (width, step)]
        pen.line(segment, fill=shade)
    return mask
def gradient_fill(img, size):
    """
    Build a vertical gradient from white to a colour derived from the input image.

    The colour is the average of two estimates: the mean of the 10% brightest
    pixels of ``img``, and the center of the K-means cluster with the least
    dispersion.
    The best solution is probably to use the deepest pixels.

    :param Image.Image img: Input image giving the colour (3 bands expected).
    :param int size: Height of the gradient.
    :return: A new RGBA image of size (img.width, size) holding the gradient.
    """
    background = Image.new("RGBA", (img.width, size), color="white")
    blend_mask = create_gradient_mask(background.width, background.height, False)
    pixels = np.asarray(img)
    # Take only the 10% brighter pixels. ">=" rather than ">": on a uniform
    # image every pixel equals the quantile, and a strict comparison selected
    # nothing, which turned the mean into NaN.
    luminance = np.asarray(img.convert("L"))
    selection = pixels[luminance >= np.quantile(luminance, 0.9)]
    mean_pixel_value = np.mean(selection, axis=0).astype(np.uint8)
    # Alternative path: use pixels with less contrast
    pixels_stack = pixels.reshape(-1, pixels.shape[-1])
    n_clusters = 5
    kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED).fit(pixels_stack)
    dispersions = np.empty(n_clusters)
    for label in range(n_clusters):
        members = pixels_stack[kmeans.labels_ == label]
        if len(members) == 0:
            # An empty cluster must never be selected (and avoids 0 / 0)
            dispersions[label] = np.inf
            continue
        dispersions[label] = np.sum(
            (kmeans.cluster_centers_[label] - members) ** 2
        ) / len(members)
    mean_pixel_value2 = kmeans.cluster_centers_[np.argmin(dispersions)]
    mean_pixel_value = (mean_pixel_value + mean_pixel_value2) / 2
    foreground = Image.fromarray(
        np.full((size, img.width, 3), mean_pixel_value).astype(np.uint8)
    ).convert("RGBA")
    blended_image = Image.composite(foreground, background, blend_mask)
    return blended_image
def add_top_frame(img, size, border_size=10, extension_filling=ExtensionFilling.MEAN):
    """
    Add a new frame on top of the current image, along with its mask.

    The added zone is filled from the top ``border_size`` rows of the image.
    The mask is white on the added zone and on those rows, black below.

    :param Image.Image img: Input image to extend.
    :param int size: Height of the added zone.
    :param int border_size: Height of the top border of the image used for the filling.
    :param extension_filling: Method to fill the top zone with a crop from the original image.
        Anything but GRADIENT and STRETCH uses the mean colour of the border.
    :type extension_filling: ExtensionFilling
    :return: The extended image and the matching mask.
    """
    canvas_size = img.width, img.height + size
    # White canvas holding the original image under the new zone
    new_img = Image.new("RGB", canvas_size, color="white")
    new_img.paste(img, (0, size))
    # Top border of the original image, source of the filling
    cropped = img.crop((0, 0, img.width, border_size))
    if extension_filling == ExtensionFilling.GRADIENT:
        top_fill = gradient_fill(cropped, size)
    elif extension_filling == ExtensionFilling.STRETCH:
        top_fill = cropped.resize((cropped.width, size))
    else:
        border_pixels = np.asarray(cropped).reshape(-1, 3)
        mean_pixel_value = np.mean(border_pixels, axis=0).astype(np.uint8)
        top_fill = Image.fromarray(np.full((size, cropped.width, 3), mean_pixel_value))
    new_img.paste(top_fill)
    # Everything is to be inpainted, except the image below its top border
    mask = Image.new("L", canvas_size, color="white")
    kept_area = Image.new("L", (img.width, img.height - border_size), color="black")
    mask.paste(kept_area, (0, size + border_size))
    return new_img, mask
def create_top_mask(img, sky_size=None, extension_filling=ExtensionFilling.MEAN):
    """
    Extend the image upwards and create the mask of the added sky zone.

    Thin wrapper around ``add_top_frame``.

    :param Image.Image img: Input image to extend.
    :param int sky_size: Height of the added zone.
        If None, the top extension radius of the FRAME_CONFIG dictionary is used.
    :param extension_filling: Method to fill the top zone with a crop from the original image.
        Default is ExtensionFilling.MEAN.
    :type extension_filling: ExtensionFilling
    :return: A tuple containing the extended image and the corresponding mask.
    :rtype: tuple[PIL.Image.Image, PIL.Image.Image]
    """
    height = FRAME_CONFIG["top_extension"]["radius"] if sky_size is None else sky_size
    return add_top_frame(img, height, extension_filling=extension_filling)
def display_masks(base_file):
    """Open an image file and show its horizontal tiling and top extension previews."""
    base_image = Image.open(base_file)
    tiling_preview, _ = horizontal_tiling_mask(base_image, FRAME_CONFIG)
    tiling_preview.show("Central mask applied")
    sky_preview, _ = create_top_mask(base_image)
    sky_preview.show("Top mask applied")
+if __name__ == "__main__":
+ display_masks("../sunny_mountain.png")
diff --git a/skybox/panorama_creator.py b/skybox/panorama_creator.py
new file mode 100644
index 0000000..a4cabb7
--- /dev/null
+++ b/skybox/panorama_creator.py
@@ -0,0 +1,505 @@
+A Python script using Stable Diffusion and inpainting to create panoramas and skyboxes.
+from PIL import Image, ImageFilter, ImageDraw
+import numpy as np
+import torch
+from torchvision.transforms.functional import pil_to_tensor
+from skybox.diffusion import generate_images
+from skybox.inpainting import inpaint_image
+from skybox import mask_editor as me
+from skybox import image_processing
+def clamp(x, low=0, high=1):
+    """
+    Restrict a value to the interval [low, high].
+    The upper bound is applied first, then the lower bound.
+    :param float x: Value to restrict
+    :param float low: Lower bound
+    :param float high: Upper bound
+    :return float: The restricted value
+    """
+    capped = high if high < x else x
+    return low if low > capped else capped
+def equirectangular_projection(img):
+    """
+    Compute an equirectangular-like projection from a flat image.
+    The textbook formula to convert a set of coordinates (latitude, longitude) on a sphere
+    to an equirectangular projection is:
+        x = (longitude + 180) * (image width / 360)
+        y = (90 - latitude) * (image height / 180)
+    This function does not use that formula. Every output pixel (x, y) keeps its row and
+    samples the source column shifted horizontally by
+        width / 2 * sin(lon * pi / 2) * (1 - cos(lat * pi / 2))
+    where lon and lat are x and y rescaled around the image centre to [-1, 1).
+    :param PIL.Image.Image img: Input image
+    :return PIL.Image.Image: Projected RGB image, same size as the input.
+    """
+    width, height = img.size
+    equirectangular_image = Image.new("RGB", (width, height), "white")
+    # The vertical factor depends on the row only: compute it once per row instead of
+    # once per pixel.
+    vertical_factors = [
+        1 - np.cos((y - height / 2) * 2 / height * np.pi / 2) for y in range(height)
+    ]
+    for x in range(width):
+        # The horizontal amplitude depends on the column only.
+        lon = (x - width / 2) * 2 / width
+        horizontal_shift = width / 2 * np.sin(lon * np.pi / 2)
+        for y, vertical_factor in enumerate(vertical_factors):
+            # Clamping keeps the source column inside the image, so no further bounds
+            # check is needed before reading the pixel.
+            u = int(clamp(x + horizontal_shift * vertical_factor, 0, width - 1))
+            equirectangular_image.putpixel((x, y), img.getpixel((u, y)))
+    return equirectangular_image
+def cylindrical_projection(img):
+    """
+    Compute a cylindrical projection from a flat image.
+    The x-axis is preserved, but the y-axis is remapped with an arccosine curve; each output
+    row is a linear interpolation of the two closest source rows.
+    This is the inverse operation of a Lambert projection.
+    :param PIL.Image.Image img: Input image
+    :return PIL.Image.Image: Output image in cylindrical projection
+    """
+    image = pil_to_tensor(img)
+    height = image.shape[1]
+    cylindrical_image = torch.empty(image.shape)
+    # Fractional source row of every output row, from [0, height] to [0, height].
+    # As the view is essentially from a cylinder to a sphere, a reverse cosine is applied.
+    lines = height * (1 - torch.arccos(torch.linspace(-1, 1, height)) / torch.pi)
+    # Weight of row v + 1. It has to be measured from floor(lines), the same row v that
+    # int() selects below; measuring it from round(lines) produced negative weights
+    # whenever the fractional part was 0.5 or more.
+    ratios = lines - torch.floor(lines)
+    for y in range(height):
+        v = int(lines[y].item())
+        ratio = ratios[y]
+        if v + 1 < height:
+            interpolates = image[:, v + 1] * ratio + (1 - ratio) * image[:, v]
+        else:
+            # No row below the source row: reuse the last row of the image
+            interpolates = image[:, height - 1]
+        cylindrical_image[:, y] = interpolates
+    # Convert as a pillow image
+    return Image.fromarray(
+        np.transpose(cylindrical_image.numpy(), (1, 2, 0)).astype("uint8")
+    )
+def horizontal_tiling(img):
+    """
+    Put two copies of an image side by side to check whether it can be tiled with itself.
+    :param PIL.Image.Image img: Base image to tile.
+    :return PIL.Image.Image: RGB image twice as wide, holding the base image twice.
+    """
+    tile_width, tile_height = img.size
+    tiled = Image.new("RGB", (tile_width * 2, tile_height))
+    # One copy on the left half, one copy on the right half
+    for x_offset in (0, tile_width):
+        tiled.paste(img, (x_offset, 0))
+    return tiled
+def blend_borders(img, size=10):
+    """
+    Blend the borders of an image to make them match. The new image is centered on the borders.
+    The image is shifted by half its width so that the former left and right borders meet
+    in the middle, then a vertical band around that seam is smoothed.
+    :param PIL.Image.Image img: Input image.
+    :param int size: Width in pixels of the smoothed band (size // 2 on each side of the seam).
+    :return PIL.Image.Image img: Auto-blended image.
+    """
+    width, height = img.size
+    half_width = width // 2
+    half_band = size // 2
+    # Move the left part to the right with an affine translation, then bring the
+    # right part of the original image to the left.
+    shifted = img.transform(
+        img.size, Image.Transform.AFFINE, (1, 0, -half_width, 0, 1, 0)
+    )
+    shifted.paste(img.crop((half_width, 0, width, height)), (0, 0))
+    # Smooth the band where the former borders now touch
+    seam_box = (half_width - half_band, 0, half_width + half_band, height)
+    shifted.paste(shifted.crop(seam_box).filter(ImageFilter.SMOOTH), seam_box)
+    return shifted
+def rewrite_image_borders(image, steps=20):
+    """
+    Inpaint the borders of an image to remove a seam line.
+    The image and its mask come from mask_editor.horizontal_tiling_mask; the inpainted
+    result is passed through image_processing.flip_image_sides before being returned.
+    :param PIL.Image.Image image: Initial image.
+    :param int steps: Number of steps for inpainting.
+    :return PIL.Image.Image: The inpainted image.
+    """
+    masked_image, seam_mask = me.horizontal_tiling_mask(image)
+    candidates = inpaint_image(
+        "",
+        masked_image,
+        seam_mask,
+        negative_prompt="a logo, a text",
+        num_inference_steps=steps,
+    )
+    # Only the first generated image is used
+    return image_processing.flip_image_sides(candidates[0])
+def add_ground(base_image, steps, step_callback=None):
+    """
+    Create a new ground for an image with inpainting.
+    The process is the following:
+    1. The image is flipped upside down, its upper half is kept and extended with a
+       stretched frame.
+    2. The result is distorted with image_processing.distort_image.
+    3. An elliptical zone of the distorted image is inpainted to redraw the ground.
+    4. The inpainted image is unrolled to the width of the base image.
+    5. The unrolled ground is blended with the original bottom half of the base image.
+    :param PIL.Image.Image base_image: The input image to be extended as a ground.
+    :param int steps: The number of inference steps for the inpainting process.
+    :param step_callback: Optional callback function to be called after each inference step.
+    :type step_callback: Callable | None
+    :return PIL.Image.Image: The new ground of the image.
+    """
+    full_width, full_height = base_image.size
+    half_height = full_height // 2
+    # Upside-down copy of the image, of which only the upper half is kept
+    # (2048x256 for a 2048x512 base image)
+    flipped = base_image.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+    half_image = flipped.crop((0, 0, full_width, half_height))
+    # Add a stretched frame on top (2048x512 for a 2048x512 base image)
+    framed, _ = me.add_top_frame(
+        half_image,
+        half_height,
+        half_height // 8,
+        extension_filling=me.ExtensionFilling.STRETCH,
+    )
+    # Distort on the ground
+    distorted = image_processing.distort_image(framed)
+    # The zone to inpaint is a white ellipse inset by half_height from the top-left corner
+    ground_mask = Image.new("L", distorted.size, color="black")
+    far_edge = distorted.height - half_height
+    ImageDraw.Draw(ground_mask).ellipse(
+        (half_height, half_height, far_edge, far_edge),
+        fill="white",
+    )
+    inpainted = inpaint_image(
+        "the ground seen from above, uniform color",
+        distorted,
+        ground_mask,
+        negative_prompt="a logo, a text, clouds, birds",
+        num_inference_steps=steps,
+        callback_on_step_end=step_callback,
+    )[0]
+    # Unroll from (1024x1024) to (1024x512), then flip on both axes
+    unrolled = image_processing.unroll_top_image(
+        inpainted.transpose(Image.Transpose.ROTATE_270), full_width
+    )
+    extended_ground = unrolled.transpose(Image.Transpose.FLIP_TOP_BOTTOM).transpose(
+        Image.Transpose.FLIP_LEFT_RIGHT
+    )
+    # Stitch the new ground to the upper part without seam: the blend mask is white
+    # (new ground) everywhere except the pasted gradient mask.
+    bottom_mask = linear_gradient_mask(
+        (full_width, half_height), extended_ground.height // 10
+    ).transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+    blend_mask = Image.new("L", extended_ground.size, "white")
+    blend_mask.paste(bottom_mask)
+    # Both layers are laid on canvases with the size of the extended ground
+    original_bottom = base_image.crop((0, half_height, full_width, full_height))
+    initial_ground_frame = Image.new(base_image.mode, extended_ground.size)
+    initial_ground_frame.paste(original_bottom)
+    new_ground_frame = Image.new(base_image.mode, extended_ground.size)
+    new_ground_frame.paste(extended_ground)
+    return Image.composite(new_ground_frame, initial_ground_frame, blend_mask)
+def linear_gradient_mask(size, margin_height=10):
+    """
+    Create a mask whose top margin fades linearly from white to black.
+    The mask is a black grayscale image of the requested size. Its first margin_height
+    rows hold a vertical linear gradient, white on the top row and black on the last row
+    of the margin. Everything below the margin stays black.
+    :param tuple[int, int] size: The (width, height) of the output mask.
+    :param int margin_height: The height of the gradient at the top of the mask.
+        If it is zero or negative, the mask is returned fully black.
+    :return: A grayscale image representing the gradient mask.
+    """
+    mask = Image.new("L", size)
+    if margin_height <= 0:
+        # Nothing to draw, and resizing to an empty height is rejected by Pillow
+        return mask
+    gradient = (
+        Image
+        .linear_gradient("L")
+        # The stock gradient goes from black (top) to white (bottom): flip it
+        .transpose(Image.Transpose.FLIP_TOP_BOTTOM)
+        .resize((size[0], margin_height))
+    )
+    mask.paste(gradient)
+    return mask
+def sigmoid_gradient_mask(width, height, decay=50):
+    """
+    Create a vertical gradient mask that follows a logistic curve.
+    The mask is a grayscale image, dark at the top and light at the bottom, with the
+    transition centered on the middle row. It is used to blend two images vertically.
+    :param int width: The width of the output mask.
+    :param int height: The height of the output mask.
+    :param float decay: The speed at which the blending changes around the middle row.
+    :return: A grayscale image representing the gradient mask.
+    """
+    mask = Image.new("L", (width, height))
+    pen = ImageDraw.Draw(mask)
+    # Relative position of every row, from 0 (top) to 1 (bottom)
+    positions = np.linspace(0, 1, height)
+    # Logistic curve shape, scaled to the grayscale range
+    shades = 255 / (1 + np.exp(-decay * (positions - 0.5)))
+    for row in range(height):
+        pen.line([(0, row), (width, row)], fill=int(shades[row]))
+    return mask
+def add_sky(input_image, steps, step_callback=None):
+    """
+    Create a sky from the top half of the base image.
+    The process is the following:
+    1. The top half of the input image is extended upwards with a gradient filling.
+    2. The extended image is distorted with image_processing.distort_image.
+    3. An elliptical zone of the distorted image is inpainted as a sky.
+    4. The inpainted image is unrolled to the width of the input image.
+    5. The unrolled sky is blended with the gradient-extended image.
+    :param PIL.Image.Image input_image: The input image to be extended as a sky.
+    :param int steps: The number of inference steps for the inpainting process.
+    :param step_callback: Optional callback function to be called after each inference step.
+    :type step_callback: Callable | None
+    :return PIL.Image.Image: The blend of the inpainted sky and the gradient-extended image.
+    """
+    # Only the top half is used as context to limit the VRAM usage
+    # (the base image is expected to be around 2504x416)
+    context_height = input_image.height // 2
+    half_sky = input_image.crop((0, 0, input_image.width, context_height))
+    # Prepare the image that will receive an inpainting.
+    # The mask returned here is discarded: the inpainting zone is the ellipse drawn below.
+    gradient_extended, _ = me.create_top_mask(
+        half_sky, input_image.height, extension_filling=me.ExtensionFilling.GRADIENT
+    )
+    # Distort on a circle
+    img = image_processing.distort_image(gradient_extended)
+    mask = Image.new("L", img.size, color="black")
+    draw = ImageDraw.Draw(mask)
+    draw.ellipse(
+        (
+            half_sky.height,
+            half_sky.height,
+            img.height - half_sky.height,
+            img.height - half_sky.height,
+        ),
+        fill="white",
+    )
+    img_with_sky = inpaint_image(
+        "the sky seen from below",
+        img,
+        mask,
+        negative_prompt="a logo, a text, birds",
+        num_inference_steps=steps,
+        callback_on_step_end=step_callback,
+    )[0]
+    extended_sky = (
+        # Unroll from (4:4) to (4:~1)
+        image_processing.unroll_top_image(
+            img_with_sky.transpose(Image.Transpose.ROTATE_270), input_image.width
+        )
+        .transpose(Image.Transpose.FLIP_LEFT_RIGHT)
+    )
+    # Merge the original image and the extended version to get a seamless blend:
+    # the blend mask is white (inpainted sky) above the pasted gradient mask.
+    bottom_mask = linear_gradient_mask(input_image.size, input_image.height // 5)
+    blend_mask = Image.new("L", extended_sky.size, "white")
+    blend_mask.paste(bottom_mask, (0, input_image.height))
+    sky_blend = Image.composite(extended_sky, gradient_extended, blend_mask)
+    return sky_blend
+def concatenate_images_seamless(top_image, bottom_image):
+    """
+    Vertically concatenate two images together without leaving a seam mark.
+    Each image is laid on its own canvas next to a mirrored copy of itself, then both
+    canvases are blended with a sigmoid gradient mask.
+    :param PIL.Image.Image top_image: Image shown in the upper part of the result.
+    :param PIL.Image.Image bottom_image: Image shown in the lower part of the result.
+    :return PIL.Image.Image: The blended image, twice as high as bottom_image.
+    """
+    flip = Image.Transpose.FLIP_TOP_BOTTOM
+    blend_mask = sigmoid_gradient_mask(top_image.width, top_image.height * 2)
+    canvas_size = (bottom_image.width, bottom_image.height * 2)
+    # Top layer: the top image followed by its mirror
+    top_layer = Image.new("RGB", canvas_size)
+    top_layer.paste(top_image)
+    top_layer.paste(top_image.transpose(flip), (0, top_image.height))
+    # Bottom layer: the mirror of the bottom image followed by the bottom image
+    bottom_layer = Image.new("RGB", canvas_size)
+    bottom_layer.paste(bottom_image.transpose(flip))
+    bottom_layer.paste(bottom_image, (0, bottom_image.height))
+    return Image.composite(bottom_layer, top_layer, blend_mask)
+def extend_image(base_image, steps_per_inference=50, step_callback=None):
+    """
+    Extend the height of an image with more sky and ground.
+    The final canvas has the width of the base image and 5/2 of its height.
+    The optimal dimensions for the base image are 2508x418 (1024*sqrt(6)).
+    The closest dimensions divisible by 8 are 2504x416, but 2048x512 yields better image quality.
+    The intermediate sky and the final image are displayed with Image.show.
+    :param PIL.Image.Image base_image: Initial image to work on.
+    :param int steps_per_inference: Number of inference steps for each inpainting process.
+        The sky inpainting uses at most 30 steps.
+    :param step_callback: Optional tracker providing a callback for each inpainting process.
+    :type step_callback: server.task_tracker.TaskTracker | None
+    :return PIL.Image.Image: The final image with more sky and ground.
+    """
+    def next_callback():
+        """Request a fresh per-step callback from the tracker, if there is one."""
+        return step_callback.incomplete_callback(30) if step_callback else None
+    # Add the sky
+    sky_steps = min(steps_per_inference, 30)
+    img_with_sky = add_sky(base_image, sky_steps, next_callback())
+    img_with_sky.show()
+    # Add the ground
+    extended_ground = add_ground(base_image, steps_per_inference, next_callback())
+    # Add the three pieces to the final canvas
+    canvas_size = (base_image.width, base_image.height * 5 // 2)
+    final_image = Image.new(base_image.mode, canvas_size)
+    final_image.paste(base_image, (0, final_image.height - base_image.height // 2))
+    final_image.paste(img_with_sky)
+    final_image.paste(extended_ground, (0, img_with_sky.height))
+    final_image.show()
+    return final_image
+def legacy_extension(base_image, prompt, num_inference_steps=50):
+    """
+    Extend the base image with the legacy pipeline v0.3.
+    The main trade-off of this pipeline was that while it made seamless matching,
+    it uses up to 16 GB of VRAM and was sometimes not compliant to sky and ground requests.
+    The base image and every intermediate result are displayed with Image.show.
+    :param PIL.Image.Image base_image: Initial image to extend.
+    :param str prompt: Prompt to use to tile as a panorama.
+    :param int num_inference_steps: Number of inference steps for each inpainting.
+    :return PIL.Image.Image: The extended image with a cylindrical projection.
+    """
+    base_image.show()
+    # First inpainting: fill the zone added above the image with sky
+    print("Closing the sky...")
+    sky_input, sky_mask = me.create_top_mask(base_image)
+    img_with_sky = inpaint_image(
+        "a sky, uniform color",
+        sky_input,
+        sky_mask,
+        negative_prompt="a logo, a text, clouds, birds",
+        num_inference_steps=num_inference_steps,
+    )[0]
+    img_with_sky.show("Image with more sky")
+    # Second inpainting: redraw the zone given by the horizontal tiling mask
+    tiling_input, tiling_mask = me.horizontal_tiling_mask(img_with_sky)
+    print("Fixing the panorama...")
+    panorama = inpaint_image(
+        prompt,
+        tiling_input,
+        tiling_mask,
+        negative_prompt="a logo, a text",
+        num_inference_steps=num_inference_steps,
+    )[0]
+    panorama.show("panorama")
+    # Project on a cylinder, then smooth the seam over 10 pixels
+    return blend_borders(cylindrical_projection(panorama), 10)
+def generate_panorama_legacy(prompt, num_inference_steps=50):
+    """
+    Create a panorama from a prompt with the legacy pipeline.
+    A panorama is an image with a deformation on the vertical axis.
+    A 2048x512 base image is generated, then extended by legacy_extension.
+    :param str prompt: The initial user prompt.
+    :param int num_inference_steps: Number of inference steps for each step.
+    :return PIL.Image.Image: The computed panorama.
+    """
+    print("Generating image...")
+    candidates = generate_images(
+        prompt, num_inference_steps=num_inference_steps, width=2048, height=512
+    )
+    # Only the first generated image is extended
+    return legacy_extension(candidates[0], prompt, num_inference_steps)
+def generate_panorama(prompt, num_inference_steps=50, progress_tracker=None):
+    """
+    Create a panorama from a prompt, more complete than the legacy version.
+    A 2504x416 base image is generated and displayed, its borders are inpainted to
+    remove the seam, then sky and ground are added by extend_image.
+    :param str prompt: The initial user prompt.
+    :param int num_inference_steps: Number of inference steps for the generation and for
+        the extension. The border inpainting keeps the default of rewrite_image_borders.
+    :param progress_tracker: A TaskTracker providing the per-step callbacks.
+    :type progress_tracker: server.task_tracker.TaskTracker | None
+    :return PIL.Image.Image: The computed panorama.
+    """
+    generation_callback = (
+        progress_tracker.incomplete_callback(30) if progress_tracker else None
+    )
+    base_image = generate_images(
+        prompt,
+        num_inference_steps=num_inference_steps,
+        width=2504,
+        height=416,
+        callback_on_step_end=generation_callback,
+    )[0]
+    base_image.show()
+    # Inpaint to blend the borders
+    seamless = rewrite_image_borders(base_image)
+    return extend_image(seamless, num_inference_steps, progress_tracker)
+def __user_interaction(num_inference_steps=50, use_legacy=False):
+    """
+    A demonstration function that asks an image prompt to the user and shows the result.
+    An empty or whitespace-only answer is replaced by a default prompt.
+    :param int num_inference_steps: Number of inference steps for each generation step.
+    :param bool use_legacy: Use the legacy pipeline instead of the current one.
+    """
+    prompt = input("What panorama do you want? ")
+    # str.strip() never returns a whitespace-only string, so the emptiness of the
+    # stripped answer is the only test needed to detect a blank prompt.
+    if not prompt.strip():
+        prompt = "a peaceful valley"
+        print("Using prompt: " + prompt)
+    if use_legacy:
+        img = generate_panorama_legacy(prompt, num_inference_steps)
+    else:
+        img = generate_panorama(prompt, num_inference_steps)
+    img.show("Final panorama")
+if __name__ == "__main__":
+ # Interactive demo, run with 20 inference steps instead of the default 50.
+ __user_interaction(20)
diff --git a/sound/ambient_generation.py b/sound/ambient_generation.py
new file mode 100644
index 0000000..eae0560
--- /dev/null
+++ b/sound/ambient_generation.py
@@ -0,0 +1,85 @@
+Generates ambient audio from a text prompt.
+* For ambient audio:
+ * https://huggingface.co/declare-lab/tango2 : apparently a good model but difficult to integrate
+ * https://huggingface.co/facebook/audiogen-medium : less good but sufficient model
+* Music : https://huggingface.co/facebook/musicgen-small
+* Text-to-speech : https://huggingface.co/suno/bark
+from audiocraft.models import AudioGen
+from audiocraft.data.audio import audio_write
+def ambient_audio(descriptions, duration=10):
+    """
+    Generate one audio sample per description with the AudioGen medium model.
+    The pretrained model is loaded on every call.
+    :param list[str] descriptions: Descriptions of the audio samples to generate.
+    :param int duration: The duration given to the generation parameters.
+    :return tuple[torch.Tensor, int]: WAVE audio samples and sample rate.
+    """
+    generator = AudioGen.get_pretrained("facebook/audiogen-medium")
+    generator.set_generation_params(duration=duration)
+    return generator.generate(descriptions), generator.sample_rate
+def ambient_music(descriptions, duration=30):
+    """
+    Generate one music sample per description with the MusicGen medium model.
+    The pretrained model is loaded on every call.
+    :param list[str] descriptions: Descriptions of the music samples to generate.
+    :param int duration: The duration given to the generation parameters.
+    :return tuple[torch.Tensor, int]: WAVE audio samples and sample rate.
+    """
+    # The musicgen checkpoints have to be loaded with MusicGen, not with AudioGen.
+    # Imported here because the module only imports AudioGen at the top of the file.
+    from audiocraft.models import MusicGen
+    model = MusicGen.get_pretrained("facebook/musicgen-medium")
+    model.set_generation_params(duration=duration)
+    wav = model.generate(descriptions)
+    return wav, model.sample_rate
+def generate_audio(descriptions, duration=10):
+    """
+    Generate audio samples based on the descriptions provided and save them as .wav files.
+    The samples are written to outputs/audio_{idx}.wav, idx being the position of the
+    description in the list.
+    :param list[str] descriptions: Descriptions of the audio samples to generate.
+    :param int duration: The duration of the audio in seconds. Default is 10 seconds.
+    """
+    wav_data, sample_rate = ambient_audio(descriptions, duration)
+    for idx, one_wav in enumerate(wav_data):
+        # audio_write expects the file name without extension and appends ".wav" itself,
+        # so the extension must not be part of the name (it produced "*.wav.wav" files).
+        # Will save under outputs/audio_{idx}.wav, with loudness normalization at -14 db LUFS.
+        audio_write(
+            f"outputs/audio_{idx}",
+            one_wav.cpu(),
+            sample_rate,
+            strategy="loudness",
+            loudness_compressor=True,
+        )
+def generate_music(descriptions, duration=30):
+    """
+    Generate music based on the descriptions provided and save it as .wav files.
+    The samples are written to outputs/music_{idx}.wav, idx being the position of the
+    description in the list.
+    :param list[str] descriptions: Descriptions of the music samples to generate.
+    :param int duration: The duration of the audio in seconds. Default is 30 seconds.
+    """
+    wav_data, sample_rate = ambient_music(descriptions, duration)
+    for idx, one_wav in enumerate(wav_data):
+        # audio_write expects the file name without extension and appends ".wav" itself,
+        # so the extension must not be part of the name (it produced "*.wav.wav" files).
+        # Will save under outputs/music_{idx}.wav, with loudness normalization at -14 db LUFS.
+        audio_write(
+            f"outputs/music_{idx}",
+            one_wav.cpu(),
+            sample_rate,
+            strategy="loudness",
+            loudness_compressor=True,
+        )
+if __name__ == "__main__":
+ # Demo: generate one ambient sample for each of the three descriptions.
+ generate_audio(["Seagulls crying", "Waves crashing", "Water lapping at the shore"])
+ # generate_music(["Calm and relaxing music"])
diff --git a/sunny_mountain.png b/sunny_mountain.png
new file mode 100644
index 0000000..6f594b0
--- /dev/null
+++ b/utils/download_models.py
@@ -0,0 +1,27 @@
+Simple utility script that forces the download of all models.
+Just load the script, and the models should get installed.
+import asr.speech_to_text
+import skybox.diffusion
+import skybox.inpainting
+def load_production_pipelines():
+    """
+    Load all pipelines used in the server in order to download the associated models.
+    The pipelines are loaded one after the other, with a progress message before each.
+    """
+    print("Starting loading models")
+    loading_steps = (
+        ("Loading speech recognition...", asr.speech_to_text.get_asr_model),
+        ("Loading image generation...", skybox.diffusion.get_image_generation_pipeline),
+        ("Loading image refinement...", skybox.diffusion.get_image_refinement_pipeline),
+        ("Loading inpainting...", skybox.inpainting.get_inpainting_pipeline),
+    )
+    for message, load_pipeline in loading_steps:
+        print(message)
+        load_pipeline()
+    print("Finished loading models with success!")
+if __name__ == "__main__":
+ # Running the script loads every production pipeline once.
+ load_production_pipelines()