From bc28ad0e0355916cb7e0b2df5ee0992f2e0b427c Mon Sep 17 00:00:00 2001 From: Volpeon Date: Sat, 4 Mar 2023 19:24:24 +0100 Subject: More flexible pipeline wrt init noise --- .../stable_diffusion/vlpn_stable_diffusion.py | 57 +++++++++++++++++----- 1 file changed, 44 insertions(+), 13 deletions(-) (limited to 'pipelines/stable_diffusion') diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py index 5f4fc38..f27be78 100644 --- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py +++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py @@ -1,7 +1,7 @@ import inspect import warnings import math -from typing import List, Dict, Any, Optional, Union, Callable +from typing import List, Dict, Any, Optional, Union, Callable, Literal import numpy as np import torch @@ -22,7 +22,7 @@ from diffusers import ( PNDMScheduler, ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import logging +from diffusers.utils import logging, randn_tensor from transformers import CLIPTextModel, CLIPTokenizer from models.clip.util import unify_input_ids, get_extended_embeddings @@ -312,7 +312,7 @@ class VlpnStableDiffusion(DiffusionPipeline): ).expand(batch_size, 3, width, height) return (1.4 * noise).clamp(-1, 1) - def prepare_latents(self, init_image, timestep, batch_size, dtype, device, generator=None): + def prepare_latents_from_image(self, init_image, timestep, batch_size, dtype, device, generator=None): init_image = init_image.to(device=device, dtype=dtype) init_latents = self.vae.encode(init_image).latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents @@ -334,6 +334,23 @@ class VlpnStableDiffusion(DiffusionPipeline): return latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -373,7 +390,7 @@ class VlpnStableDiffusion(DiffusionPipeline): sag_scale: float = 0.75, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + image: Optional[Union[torch.FloatTensor, PIL.Image.Image, Literal["noise"]]] = None, output_type: str = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -443,8 +460,10 @@ class VlpnStableDiffusion(DiffusionPipeline): # 2. Define call parameters batch_size = len(prompt) device = self.execution_device + num_channels_latents = self.unet.in_channels do_classifier_free_guidance = guidance_scale > 1.0 do_self_attention_guidance = sag_scale > 0.0 + prep_from_image = isinstance(image, PIL.Image.Image) or image == "noise" # 3. Encode input prompt prompt_embeds = self.encode_prompt( @@ -458,7 +477,7 @@ class VlpnStableDiffusion(DiffusionPipeline): # 4. Prepare latent variables if isinstance(image, PIL.Image.Image): image = preprocess(image) - elif image is None: + elif image == "noise": image = self.prepare_image( batch_size * num_images_per_prompt, width, @@ -474,14 +493,26 @@ class VlpnStableDiffusion(DiffusionPipeline): latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size * num_images_per_prompt, - prompt_embeds.dtype, - device, - generator - ) + if prep_from_image: + latents = self.prepare_latents_from_image( + image, + latent_timestep, + batch_size * num_images_per_prompt, + prompt_embeds.dtype, + device, + generator + ) + else: + latents = self.prepare_latents( + batch_size, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + image + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) -- cgit v1.2.3-54-g00ecf