From 5b80eb8dac50941c05209df9bb560959ab81bdb0 Mon Sep 17 00:00:00 2001 From: Volpeon Date: Sat, 4 Mar 2023 08:17:31 +0100 Subject: Pipeline: Improved initial image generation --- .../stable_diffusion/vlpn_stable_diffusion.py | 49 ++++++++++++---------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'pipelines/stable_diffusion') diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py index 242be29..2251848 100644 --- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py +++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py @@ -295,16 +295,14 @@ class VlpnStableDiffusion(DiffusionPipeline): def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep + offset, 0) + t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] timesteps = timesteps.to(device) - return timesteps + return timesteps, num_inference_steps - t_start def prepare_image(self, batch_size, width, height, max_offset, dtype, device, generator=None): offset = (max_offset * (2 * torch.rand( @@ -312,12 +310,16 @@ class VlpnStableDiffusion(DiffusionPipeline): dtype=dtype, device=device, generator=generator - ) - 1)).expand(batch_size, 3, width, height) - image = (.1 * torch.normal( - mean=offset, - std=1, - generator=generator - )).clamp(-1, 1) + ) - 1)).expand(batch_size, 1, 2, 2) + image = F.interpolate( + torch.normal( + mean=offset, + std=0.3, + generator=generator + ).clamp(-1, 1), + size=(width, height), + mode="bicubic" + ).expand(batch_size, 3, width, height) return image def prepare_latents(self, init_image, timestep, 
batch_size, dtype, device, generator=None): @@ -382,7 +384,7 @@ class VlpnStableDiffusion(DiffusionPipeline): eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, - max_image_offset: float = 1.0, + max_init_offset: float = 0.7, output_type: str = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -464,11 +466,7 @@ class VlpnStableDiffusion(DiffusionPipeline): device ) - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.get_timesteps(num_inference_steps, strength, device) - - # 5. Prepare latent variables + # 4. Prepare latent variables if isinstance(image, PIL.Image.Image): image = preprocess(image) elif image is None: @@ -476,13 +474,18 @@ class VlpnStableDiffusion(DiffusionPipeline): batch_size * num_images_per_prompt, width, height, - max_image_offset, + max_init_offset, prompt_embeds.dtype, device, generator ) + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables latents = self.prepare_latents( image, latent_timestep, @@ -492,10 +495,10 @@ class VlpnStableDiffusion(DiffusionPipeline): generator ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7. Denoising loo + # 8. 
Denoising loop if do_self_attention_guidance: store_processor = CrossAttnStoreProcessor() self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor @@ -559,13 +562,13 @@ class VlpnStableDiffusion(DiffusionPipeline): if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 8. Post-processing + # 9. Post-processing image = self.decode_latents(latents) - # 9. Run safety checker + # 10. Run safety checker has_nsfw_concept = None - # 10. Convert to PIL + # 11. Convert to PIL if output_type == "pil": image = self.numpy_to_pil(image) -- cgit v1.2.3-70-g09d2