From 27b18776ba6d38d6bda5e5bafee3e7c4ca8c9712 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Sat, 24 Jun 2023 16:26:22 +0200
Subject: Fixes

---
 .../stable_diffusion/vlpn_stable_diffusion.py | 41 +++++++++++-----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index 98703d5..204276e 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -9,6 +9,7 @@ import torch.nn.functional as F
 import PIL

 from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils import is_accelerate_available
 from diffusers import (
     AutoencoderKL,
@@ -161,6 +162,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
@@ -443,14 +445,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs

-    def decode_latents(self, latents):
-        latents = 1 / self.vae.config.scaling_factor * latents
-        image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
     @torch.no_grad()
     def __call__(
         self,
@@ -544,6 +538,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
         do_classifier_free_guidance = guidance_scale > 1.0
         do_self_attention_guidance = sag_scale > 0.0
         prep_from_image = isinstance(image, PIL.Image.Image)
+        if not prep_from_image:
+            strength = 1

         # 3. Encode input prompt
         prompt_embeds = self.encode_prompt(
@@ -577,7 +573,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             )
         else:
             latents = self.prepare_latents(
-                batch_size,
+                batch_size * num_images_per_prompt,
                 num_channels_latents,
                 height,
                 width,
@@ -623,9 +619,12 @@ class VlpnStableDiffusion(DiffusionPipeline):
                     noise_pred = noise_pred_uncond + guidance_scale * (
                         noise_pred_text - noise_pred_uncond
                     )
-                    noise_pred = rescale_noise_cfg(
-                        noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
-                    )
+                    if guidance_rescale > 0.0:
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred,
+                            noise_pred_text,
+                            guidance_rescale=guidance_rescale,
+                        )

                 if do_self_attention_guidance:
                     # classifier-free guidance produces two chunks of attention map
@@ -690,17 +689,17 @@ class VlpnStableDiffusion(DiffusionPipeline):

         has_nsfw_concept = None

-        if output_type == "latent":
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False
+            )[0]
+        else:
             image = latents
-        elif output_type == "pil":
-            # 9. Post-processing
-            image = self.decode_latents(latents)
-            # 10. Convert to PIL
-            image = self.numpy_to_pil(image)
-        else:
-            # 9. Post-processing
-            image = self.decode_latents(latents)
+
+        do_denormalize = [True] * image.shape[0]
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )

         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
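
Note: the hunks above drop the hand-rolled decode_latents() helper in favor of diffusers' VaeImageProcessor. Below is a minimal standalone sketch (not part of the patch) of the new decode path; the checkpoint name and the latents_to_output() helper are illustrative assumptions, not code from this repository.

import torch
from diffusers import AutoencoderKL
from diffusers.image_processor import VaeImageProcessor

# Assumed standalone VAE checkpoint; any Stable Diffusion VAE works the same way.
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

@torch.no_grad()
def latents_to_output(latents: torch.Tensor, output_type: str = "pil"):
    if output_type != "latent":
        # Undo the scaling applied at encode time, then decode to image space.
        image = vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]
    else:
        image = latents
    # postprocess() denormalizes [-1, 1] to [0, 1], casts to float32, and returns
    # PIL images, a numpy array, or a torch tensor depending on output_type.
    return image_processor.postprocess(
        image, output_type=output_type, do_denormalize=[True] * image.shape[0]
    )

# Example: decode a random 64x64 latent (512x512 pixels after the 8x VAE upscale).
images = latents_to_output(torch.randn(1, 4, 64, 64))
images[0].save("decoded.png")

In recent diffusers releases postprocess() returns the tensor unchanged when output_type is "latent", which is why the pipeline can route every output type through the same call.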