From 27b18776ba6d38d6bda5e5bafee3e7c4ca8c9712 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Sat, 24 Jun 2023 16:26:22 +0200
Subject: Fixes

---
 .../stable_diffusion/vlpn_stable_diffusion.py | 41 +++++++++++-----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index 98703d5..204276e 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -9,6 +9,7 @@ import torch.nn.functional as F
 import PIL

 from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils import is_accelerate_available
 from diffusers import (
     AutoencoderKL,
@@ -161,6 +162,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
@@ -443,14 +445,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs

-    def decode_latents(self, latents):
-        latents = 1 / self.vae.config.scaling_factor * latents
-        image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
     @torch.no_grad()
     def __call__(
         self,
@@ -544,6 +538,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
         do_classifier_free_guidance = guidance_scale > 1.0
         do_self_attention_guidance = sag_scale > 0.0
         prep_from_image = isinstance(image, PIL.Image.Image)
+        if not prep_from_image:
+            strength = 1

         # 3. Encode input prompt
         prompt_embeds = self.encode_prompt(
@@ -577,7 +573,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             )
         else:
             latents = self.prepare_latents(
-                batch_size,
+                batch_size * num_images_per_prompt,
                 num_channels_latents,
                 height,
                 width,
@@ -623,9 +619,12 @@ class VlpnStableDiffusion(DiffusionPipeline):
                     noise_pred = noise_pred_uncond + guidance_scale * (
                         noise_pred_text - noise_pred_uncond
                     )
-                    noise_pred = rescale_noise_cfg(
-                        noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
-                    )
+                    if guidance_rescale > 0.0:
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred,
+                            noise_pred_text,
+                            guidance_rescale=guidance_rescale,
+                        )

                 if do_self_attention_guidance:
                     # classifier-free guidance produces two chunks of attention map
@@ -690,17 +689,17 @@ class VlpnStableDiffusion(DiffusionPipeline):

         has_nsfw_concept = None

-        if output_type == "latent":
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False
+            )[0]
+        else:
             image = latents
-        elif output_type == "pil":
-            # 9. Post-processing
-            image = self.decode_latents(latents)
-            # 10. Convert to PIL
-            image = self.numpy_to_pil(image)
-        else:
-            # 9. Post-processing
-            image = self.decode_latents(latents)
+
+        do_denormalize = [True] * image.shape[0]
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )

         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
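
Note: the hunks above drop the hand-rolled decode_latents() helper in favor of diffusers' VaeImageProcessor. Below is a minimal standalone sketch (not part of the patch) of the new decode path; the checkpoint name and the latents_to_output() helper are illustrative assumptions, not code from this repository.

import torch
from diffusers import AutoencoderKL
from diffusers.image_processor import VaeImageProcessor

# Assumed standalone VAE checkpoint; any Stable Diffusion VAE works the same way.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

@torch.no_grad()
def latents_to_output(latents: torch.Tensor, output_type: str = "pil"):
    if output_type != "latent":
        # Undo the scaling applied at encode time, then decode to image space.
        image = vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]
    else:
        image = latents
    # postprocess() denormalizes [-1, 1] to [0, 1], casts to float32, and returns
    # PIL images, a numpy array, or a torch tensor depending on output_type.
    return image_processor.postprocess(
        image, output_type=output_type, do_denormalize=[True] * image.shape[0]
    )

# Example: decode a random 64x64 latent (512x512 pixels after the 8x VAE upscale).
images = latents_to_output(torch.randn(1, 4, 64, 64))
images[0].save("decoded.png")

In recent diffusers releases postprocess() returns the tensor unchanged when output_type is "latent", which is why the pipeline can route every output type through the same call.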