Diffstat (limited to 'pipelines')
-rw-r--r--  pipelines/stable_diffusion/vlpn_stable_diffusion.py (renamed from pipelines/stable_diffusion/clip_guided_stable_diffusion.py)  | 80
1 file changed, 64 insertions(+), 16 deletions(-)
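The renamed pipeline keeps the text-to-image behaviour but adds an image-to-image path: `latents` may now be a `PIL.Image.Image`, which is VAE-encoded and noised according to the new `strength` argument. A minimal, hedged usage sketch (not part of the diff; the weights directory, image file, and prompts are placeholders, and loading via `from_pretrained` assumes the weights were saved in diffusers format for this pipeline class):

```python
# Hedged usage sketch -- not part of the diff. The import path follows the repository
# layout above; the weights directory is a placeholder and assumes a pipeline saved in
# diffusers format for this class (e.g. via save_pretrained()).
import torch
from PIL import Image

from pipelines.stable_diffusion.vlpn_stable_diffusion import VlpnStableDiffusion

pipe = VlpnStableDiffusion.from_pretrained("path/to/stable-diffusion-weights").to("cuda")

init_image = Image.open("input.png").convert("RGB")  # placeholder init image

output = pipe(
    prompt="a watercolor landscape",
    negative_prompt="blurry, low quality",
    latents=init_image,        # passing a PIL image selects the new img2img path
    strength=0.8,              # 0.0 keeps the init image, 1.0 effectively ignores it
    num_inference_steps=50,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(0),
)
image = output.images[0]
```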
diff --git a/pipelines/stable_diffusion/clip_guided_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index eff74b5..4c793a8 100644
--- a/pipelines/stable_diffusion/clip_guided_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -2,22 +2,29 @@ import inspect
 import warnings
 from typing import List, Optional, Union
 
+import numpy as np
 import torch
-from torch import nn
-from torch.nn import functional as F
+import PIL
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers import AutoencoderKL, DiffusionPipeline, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import logging
-from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer
 from schedulers.scheduling_euler_a import EulerAScheduler, CFGDenoiserForward
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class CLIPGuidedStableDiffusion(DiffusionPipeline):
+def preprocess(image, w, h):
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class VlpnStableDiffusion(DiffusionPipeline):
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -83,13 +90,14 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         eta: Optional[float] = 0.0,
         generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
+        latents: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ):
@@ -99,6 +107,12 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
             height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to 512):
@@ -158,6 +172,42 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
+        if strength < 0 or strength > 1:
+            raise ValueError(f"`strength` should in [0.0, 1.0] but is {strength}")
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        offset = self.scheduler.config.get("steps_offset", 0)
+
+        if latents is not None and isinstance(latents, PIL.Image.Image):
+            latents = preprocess(latents, width, height)
+            latent_dist = self.vae.encode(latents.to(self.device)).latent_dist
+            latents = latent_dist.sample(generator=generator)
+            latents = 0.18215 * latents
+            latents = torch.cat([latents] * batch_size)
+
+            # get the original timestep using init_timestep
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                timesteps = torch.tensor(
+                    [num_inference_steps - init_timestep] * batch_size, dtype=torch.long, device=self.device
+                )
+            elif isinstance(self.scheduler, EulerAScheduler):
+                timesteps = self.scheduler.timesteps[-init_timestep]
+                timesteps = torch.tensor([timesteps] * batch_size, device=self.device)
+            else:
+                timesteps = self.scheduler.timesteps[-init_timestep]
+                timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
+
+            # add noise to latents using the timesteps
+            noise = torch.randn(latents.shape, generator=generator, device=self.device)
+            latents = self.scheduler.add_noise(latents, noise, timesteps)
+        else:
+            init_timestep = num_inference_steps + offset
+
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
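The hunk above mirrors the timestep bookkeeping of diffusers' stock img2img pipeline: `strength` decides how deep into the noise schedule the encoded init image is injected, while the `else` branch preserves the old text-to-image behaviour. A plain-Python sketch of the arithmetic with the defaults from this diff (`offset = 1` is only an example value of a scheduler's `steps_offset`; it is 0 for schedulers without that config entry):

```python
# Plain-Python sketch of the schedule arithmetic above, using the defaults in this diff.
num_inference_steps = 50
strength = 0.8
offset = 1  # example steps_offset; 0 if the scheduler config has none

# img2img branch: start part-way into the schedule
init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)  # 41
t_start = max(num_inference_steps - init_timestep + offset, 0)                          # 10

# txt2img branch (no init image): run the whole schedule
init_timestep_txt2img = num_inference_steps + offset
t_start_txt2img = max(num_inference_steps - init_timestep_txt2img + offset, 0)          # 0

print(t_start, num_inference_steps - t_start)  # 10 40 -> the last 80% of the steps are run
```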
@@ -213,15 +263,11 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
             raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
         latents = latents.to(self.device)
 
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
 
         # Some schedulers like PNDM have timesteps as arrays
         # It's more optimzed to move all timesteps to correct device beforehand
-        if torch.is_tensor(self.scheduler.timesteps):
-            timesteps_tensor = self.scheduler.timesteps.to(self.device)
-        else:
-            timesteps_tensor = torch.tensor(self.scheduler.timesteps.copy(), device=self.device)
+        timesteps_tensor = torch.tensor(self.scheduler.timesteps[t_start:], device=self.device)
 
         # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
         if isinstance(self.scheduler, LMSDiscreteScheduler):
@@ -244,10 +290,12 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
 
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            t_index = t_start + i
+
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                sigma = self.scheduler.sigmas[i]
+                sigma = self.scheduler.sigmas[t_index]
                 # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
                 latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
 
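The `t_index` introduced here (and used again in the next hunk) compensates for the sliced timestep list: once the loop iterates only over `timesteps[t_start:]`, the `enumerate()` index no longer matches the scheduler's own indexing into `sigmas` and `timesteps`. A tiny illustration of that offset, with a plain list standing in for the scheduler's timesteps:

```python
# Why t_index is needed: the loop only sees the tail of the schedule, so the loop
# index must be shifted by t_start to line up with the scheduler's full arrays.
timesteps = list(range(50))        # stand-in for self.scheduler.timesteps
t_start = 10
for i, t in enumerate(timesteps[t_start:]):
    t_index = t_start + i          # index into scheduler.sigmas / scheduler.timesteps
    assert timesteps[t_index] == t
```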
@@ -270,10 +318,10 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
 
             # compute the previous noisy sample x_t -> x_t-1
             if isinstance(self.scheduler, LMSDiscreteScheduler):
-                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
+                latents = self.scheduler.step(noise_pred, t_index, latents, **extra_step_kwargs).prev_sample
             elif isinstance(self.scheduler, EulerAScheduler):
-                if i < self.scheduler.timesteps.shape[0] - 1: # avoid out of bound error
-                    t_prev = self.scheduler.timesteps[i+1]
+                if t_index < self.scheduler.timesteps.shape[0] - 1: # avoid out of bound error
+                    t_prev = self.scheduler.timesteps[t_index+1]
                     latents = self.scheduler.step(noise_pred, t, t_prev, latents, **extra_step_kwargs).prev_sample
             else:
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
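For reference, the new `preprocess` helper converts a PIL image into the NCHW, [-1, 1] float tensor that `self.vae.encode` expects. A quick standalone check, copying the function from the diff and feeding it a random placeholder image:

```python
# Standalone check of the new preprocess() helper: PIL image -> NCHW float tensor in [-1, 1].
import numpy as np
import PIL.Image
import torch


def preprocess(image, w, h):
    # copied from the diff above: resize, scale to [0, 1], HWC -> NCHW, then map to [-1, 1]
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2.0 * image - 1.0


# Random RGB stand-in for a real init image (any PIL RGB image behaves the same way).
img = PIL.Image.fromarray(np.random.randint(0, 256, (300, 400, 3), dtype=np.uint8))

x = preprocess(img, 512, 512)
print(x.shape)                         # torch.Size([1, 3, 512, 512])
print(x.min().item(), x.max().item())  # both within [-1.0, 1.0]
```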