 infer.py                                                                                                                     | 90
 pipelines/stable_diffusion/vlpn_stable_diffusion.py (renamed from pipelines/stable_diffusion/clip_guided_stable_diffusion.py) | 80
 2 files changed, 131 insertions(+), 39 deletions(-)
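In short: the pipeline class is renamed from CLIPGuidedStableDiffusion to VlpnStableDiffusion and gains a basic img2img path. A PIL image can now be passed through the existing `latents` argument together with a `strength` value; the image is encoded through the VAE and noised to the timestep implied by `strength`. The sketch below shows how the new entry point might be exercised; it leans on create_pipeline() from infer.py, and the model path, image file, prompt, and strength value are placeholders rather than part of this commit.

# Rough usage sketch, not part of the commit. create_pipeline() is the helper
# in infer.py shown below; paths, prompt and strength value are placeholders.
import torch
from PIL import Image

from infer import create_pipeline

pipeline = create_pipeline("path/to/stable-diffusion-model", "euler_a")
init_image = Image.open("init.png").convert("RGB")

with torch.autocast("cuda"):
    images = pipeline(
        prompt="a placeholder prompt",
        latents=init_image,   # a PIL image is now accepted and encoded via the VAE
        strength=0.7,         # 0..1, how far to move away from the init image
        num_inference_steps=50,
        guidance_scale=7.0,
        generator=torch.Generator(device="cuda").manual_seed(1),
    ).images

images[0].save("out.jpg")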
diff --git a/infer.py b/infer.py
index d917239..b440cb6 100644
--- a/infer.py
+++ b/infer.py
@@ -8,13 +8,47 @@ from pathlib import Path
 from torch import autocast
 import torch
 import json
+from PIL import Image
 from diffusers import StableDiffusionPipeline, AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler, LMSDiscreteScheduler
 from transformers import CLIPTextModel, CLIPTokenizer, CLIPFeatureExtractor
 from slugify import slugify
-from pipelines.stable_diffusion.clip_guided_stable_diffusion import CLIPGuidedStableDiffusion
+from pipelines.stable_diffusion.vlpn_stable_diffusion import VlpnStableDiffusion
 from schedulers.scheduling_euler_a import EulerAScheduler
 
 
+default_args = {
+    "model": None,
+    "scheduler": "euler_a",
+    "output_dir": "output/inference",
+    "config": None,
+}
+
+
+default_cmds = {
+    "prompt": None,
+    "negative_prompt": None,
+    "image": None,
+    "image_strength": .7,
+    "width": 512,
+    "height": 512,
+    "batch_size": 1,
+    "batch_num": 1,
+    "steps": 50,
+    "guidance_scale": 7.0,
+    "seed": None,
+    "config": None,
+}
+
+
+def merge_dicts(d1, *args):
+    d1 = d1.copy()
+
+    for d in args:
+        d1.update({k: v for (k, v) in d.items() if v is not None})
+
+    return d1
+
+
 def create_args_parser():
     parser = argparse.ArgumentParser(
         description="Simple example of a training script."
@@ -22,23 +56,19 @@ def create_args_parser():
     parser.add_argument(
         "--model",
         type=str,
-        default=None,
     )
     parser.add_argument(
         "--scheduler",
         type=str,
         choices=["plms", "ddim", "klms", "euler_a"],
-        default="euler_a",
     )
     parser.add_argument(
         "--output_dir",
         type=str,
-        default="output/inference",
     )
     parser.add_argument(
         "--config",
         type=str,
-        default=None,
     )
 
     return parser
@@ -51,66 +81,69 @@ def create_cmd_parser():
     parser.add_argument(
         "--prompt",
         type=str,
-        default=None,
     )
     parser.add_argument(
         "--negative_prompt",
         type=str,
-        default=None,
+    )
+    parser.add_argument(
+        "--image",
+        type=str,
+    )
+    parser.add_argument(
+        "--image_strength",
+        type=float,
     )
     parser.add_argument(
         "--width",
         type=int,
-        default=512,
     )
     parser.add_argument(
         "--height",
         type=int,
-        default=512,
     )
     parser.add_argument(
         "--batch_size",
         type=int,
-        default=1,
     )
     parser.add_argument(
         "--batch_num",
         type=int,
-        default=1,
     )
     parser.add_argument(
         "--steps",
         type=int,
-        default=70,
     )
     parser.add_argument(
         "--guidance_scale",
-        type=int,
-        default=7,
+        type=float,
     )
     parser.add_argument(
         "--seed",
         type=int,
-        default=None,
     )
     parser.add_argument(
         "--config",
         type=str,
-        default=None,
     )
 
     return parser
 
 
-def run_parser(parser, input=None):
+def run_parser(parser, defaults, input=None):
     args = parser.parse_known_args(input)[0]
+    conf_args = argparse.Namespace()
 
     if args.config is not None:
         with open(args.config, 'rt') as f:
-            args = parser.parse_known_args(
+            conf_args = parser.parse_known_args(
                 namespace=argparse.Namespace(**json.load(f)["args"]))[0]
 
-    return args
+    res = defaults.copy()
+    for dict in [vars(conf_args), vars(args)]:
+        res.update({k: v for (k, v) in dict.items() if v is not None})
+
+    return argparse.Namespace(**res)
 
 
 def save_args(basepath, args, extra={}):
@@ -146,7 +179,7 @@ def create_pipeline(model, scheduler, dtype=torch.bfloat16):
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False
     )
 
-    pipeline = CLIPGuidedStableDiffusion(
+    pipeline = VlpnStableDiffusion(
         text_encoder=text_encoder,
         vae=vae,
         unet=unet,
@@ -154,7 +187,7 @@ def create_pipeline(model, scheduler, dtype=torch.bfloat16):
         scheduler=scheduler,
         feature_extractor=feature_extractor
     )
-    pipeline.enable_attention_slicing()
+    # pipeline.enable_attention_slicing()
     pipeline.to("cuda")
 
     print("Pipeline loaded.")
@@ -171,6 +204,13 @@ def generate(output_dir, pipeline, args):
 
     save_args(output_dir, args)
 
+    if args.image:
+        init_image = Image.open(args.image)
+        if not init_image.mode == "RGB":
+            init_image = init_image.convert("RGB")
+    else:
+        init_image = None
+
     with autocast("cuda"):
         for i in range(args.batch_num):
             pipeline.set_progress_bar_config(desc=f"Batch {i + 1} of {args.batch_num}")
@@ -184,11 +224,15 @@ def generate(output_dir, pipeline, args):
                 num_inference_steps=args.steps,
                 guidance_scale=args.guidance_scale,
                 generator=generator,
+                latents=init_image,
             ).images
 
             for j, image in enumerate(images):
                 image.save(output_dir.joinpath(f"{seed + i}_{j}.jpg"))
 
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
 
 class CmdParse(cmd.Cmd):
     prompt = 'dream> '
@@ -213,7 +257,7 @@ class CmdParse(cmd.Cmd):
             return True
 
         try:
-            args = run_parser(self.parser, elements)
+            args = run_parser(self.parser, default_cmds, elements)
         except SystemExit:
             self.parser.print_help()
 
@@ -233,7 +277,7 @@ def main():
     logging.basicConfig(stream=sys.stdout, level=logging.WARN)
 
     args_parser = create_args_parser()
-    args = run_parser(args_parser)
+    args = run_parser(args_parser, default_args)
     output_dir = Path(args.output_dir)
 
     pipeline = create_pipeline(args.model, args.scheduler)
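For reference, the precedence the reworked run_parser() implements can be seen in isolation below: hard-coded defaults are overridden by values from a --config JSON file, which are in turn overridden by anything given explicitly on the command line; values still set to None count as unset. This is a self-contained sketch using a small subset of default_cmds, not additional code from the commit.

import argparse
import json

defaults = {"steps": 50, "guidance_scale": 7.0, "seed": None}

parser = argparse.ArgumentParser()
parser.add_argument("--steps", type=int)
parser.add_argument("--guidance_scale", type=float)
parser.add_argument("--seed", type=int)
parser.add_argument("--config", type=str)


def run_parser(parser, defaults, input=None):
    # Same merging logic as the new run_parser() in infer.py.
    args = parser.parse_known_args(input)[0]
    conf_args = argparse.Namespace()

    if args.config is not None:
        with open(args.config, 'rt') as f:
            conf_args = parser.parse_known_args(
                namespace=argparse.Namespace(**json.load(f)["args"]))[0]

    res = defaults.copy()
    for d in [vars(conf_args), vars(args)]:
        res.update({k: v for (k, v) in d.items() if v is not None})

    return argparse.Namespace(**res)


# Only --steps is given; guidance_scale and seed fall back to the defaults.
print(run_parser(parser, defaults, ["--steps", "30"]))
# -> Namespace(guidance_scale=7.0, seed=None, steps=30)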
diff --git a/pipelines/stable_diffusion/clip_guided_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index eff74b5..4c793a8 100644
--- a/pipelines/stable_diffusion/clip_guided_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -2,22 +2,29 @@ import inspect
 import warnings
 from typing import List, Optional, Union
 
+import numpy as np
 import torch
-from torch import nn
-from torch.nn import functional as F
+import PIL
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers import AutoencoderKL, DiffusionPipeline, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import logging
-from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer
 from schedulers.scheduling_euler_a import EulerAScheduler, CFGDenoiserForward
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class CLIPGuidedStableDiffusion(DiffusionPipeline):
+def preprocess(image, w, h):
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class VlpnStableDiffusion(DiffusionPipeline):
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -83,13 +90,14 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ):
@@ -99,6 +107,12 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to 512):
@@ -158,6 +172,42 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
+        if strength < 0 or strength > 1:
+            raise ValueError(f"`strength` should in [0.0, 1.0] but is {strength}")
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        offset = self.scheduler.config.get("steps_offset", 0)
+
+        if latents is not None and isinstance(latents, PIL.Image.Image):
+            latents = preprocess(latents, width, height)
+            latent_dist = self.vae.encode(latents.to(self.device)).latent_dist
+            latents = latent_dist.sample(generator=generator)
+            latents = 0.18215 * latents
+            latents = torch.cat([latents] * batch_size)
+
+            # get the original timestep using init_timestep
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                timesteps = torch.tensor(
+                    [num_inference_steps - init_timestep] * batch_size, dtype=torch.long, device=self.device
+                )
+            elif isinstance(self.scheduler, EulerAScheduler):
+                timesteps = self.scheduler.timesteps[-init_timestep]
+                timesteps = torch.tensor([timesteps] * batch_size, device=self.device)
+            else:
+                timesteps = self.scheduler.timesteps[-init_timestep]
+                timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
+
+            # add noise to latents using the timesteps
+            noise = torch.randn(latents.shape, generator=generator, device=self.device)
+            latents = self.scheduler.add_noise(latents, noise, timesteps)
+        else:
+            init_timestep = num_inference_steps + offset
+
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -213,15 +263,11 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
         latents = latents.to(self.device)
 
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
 
         # Some schedulers like PNDM have timesteps as arrays
         # It's more optimzed to move all timesteps to correct device beforehand
-        if torch.is_tensor(self.scheduler.timesteps):
-            timesteps_tensor = self.scheduler.timesteps.to(self.device)
-        else:
-            timesteps_tensor = torch.tensor(self.scheduler.timesteps.copy(), device=self.device)
+        timesteps_tensor = torch.tensor(self.scheduler.timesteps[t_start:], device=self.device)
 
         # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
         if isinstance(self.scheduler, LMSDiscreteScheduler):
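To make the timestep bookkeeping above concrete, here is the arithmetic in isolation: with the example values num_inference_steps=50, strength=0.7 and a scheduler steps_offset of 1 (all placeholders), the pipeline skips the first t_start timesteps and denoises over the remaining ones. This is a stand-alone sketch of the formulas added in this commit, not extra code from it.

# Example values only; the formulas mirror the img2img branch above.
num_inference_steps = 50
strength = 0.7
offset = 1  # self.scheduler.config.get("steps_offset", 0)

init_timestep = int(num_inference_steps * strength) + offset    # 36
init_timestep = min(init_timestep, num_inference_steps)         # 36
t_start = max(num_inference_steps - init_timestep + offset, 0)  # 15

print(num_inference_steps - t_start)  # 35 denoising steps actually run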
@@ -244,10 +290,12 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
 
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            t_index = t_start + i
+
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                sigma = self.scheduler.sigmas[i]
+                sigma = self.scheduler.sigmas[t_index]
                 # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
                 latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
 
@@ -270,10 +318,10 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
 
             # compute the previous noisy sample x_t -> x_t-1
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
+                latents = self.scheduler.step(noise_pred, t_index, latents, **extra_step_kwargs).prev_sample
             elif isinstance(self.scheduler, EulerAScheduler):
-                if i < self.scheduler.timesteps.shape[0] - 1: # avoid out of bound error
-                    t_prev = self.scheduler.timesteps[i+1]
+                if t_index < self.scheduler.timesteps.shape[0] - 1: # avoid out of bound error
+                    t_prev = self.scheduler.timesteps[t_index+1]
                     latents = self.scheduler.step(noise_pred, t, t_prev, latents, **extra_step_kwargs).prev_sample
             else:
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample