Diffstat (limited to 'pipelines/stable_diffusion/vlpn_stable_diffusion.py')
-rw-r--r--  pipelines/stable_diffusion/vlpn_stable_diffusion.py  41
1 file changed, 20 insertions(+), 21 deletions(-)
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index 98703d5..204276e 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -9,6 +9,7 @@ import torch.nn.functional as F
 import PIL
 
 from diffusers.configuration_utils import FrozenDict
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils import is_accelerate_available
 from diffusers import (
     AutoencoderKL,
@@ -161,6 +162,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler=scheduler,
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
@@ -443,14 +445,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
-    def decode_latents(self, latents):
-        latents = 1 / self.vae.config.scaling_factor * latents
-        image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
     @torch.no_grad()
     def __call__(
         self,
@@ -544,6 +538,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
         do_classifier_free_guidance = guidance_scale > 1.0
         do_self_attention_guidance = sag_scale > 0.0
         prep_from_image = isinstance(image, PIL.Image.Image)
+        if not prep_from_image:
+            strength = 1
 
         # 3. Encode input prompt
         prompt_embeds = self.encode_prompt(
@@ -577,7 +573,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             )
         else:
             latents = self.prepare_latents(
-                batch_size,
+                batch_size * num_images_per_prompt,
                 num_channels_latents,
                 height,
                 width,
@@ -623,9 +619,12 @@ class VlpnStableDiffusion(DiffusionPipeline):
                     noise_pred = noise_pred_uncond + guidance_scale * (
                         noise_pred_text - noise_pred_uncond
                     )
-                    noise_pred = rescale_noise_cfg(
-                        noise_pred, noise_pred_text, guidance_rescale=guidance_rescale
-                    )
+                    if guidance_rescale > 0.0:
+                        noise_pred = rescale_noise_cfg(
+                            noise_pred,
+                            noise_pred_text,
+                            guidance_rescale=guidance_rescale,
+                        )
 
                 if do_self_attention_guidance:
                     # classifier-free guidance produces two chunks of attention map
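
Guarding rescale_noise_cfg behind guidance_rescale > 0.0 only skips work that would be a no-op: with guidance_rescale == 0 the blend returns the CFG prediction unchanged. As a reference, a sketch of the rescaling that diffusers' rescale_noise_cfg performs (from "Common Diffusion Noise Schedules and Sample Steps are Flawed"):

    import torch

    def rescale_noise_cfg_sketch(noise_cfg, noise_pred_text, guidance_rescale=0.0):
        # Match the per-sample std of the CFG result to the std of the
        # text-conditioned prediction, then blend by guidance_rescale.
        std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
        std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
        rescaled = noise_cfg * (std_text / std_cfg)
        return guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_cfg
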
@@ -690,17 +689,17 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
         has_nsfw_concept = None
 
-        if output_type == "latent":
+        if not output_type == "latent":
+            image = self.vae.decode(
+                latents / self.vae.config.scaling_factor, return_dict=False
+            )[0]
+        else:
             image = latents
-        elif output_type == "pil":
-            # 9. Post-processing
-            image = self.decode_latents(latents)
 
-            # 10. Convert to PIL
-            image = self.numpy_to_pil(image)
-        else:
-            # 9. Post-processing
-            image = self.decode_latents(latents)
+        do_denormalize = [True] * image.shape[0]
+        image = self.image_processor.postprocess(
+            image, output_type=output_type, do_denormalize=do_denormalize
+        )
 
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
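
Taken together, the new tail of __call__ follows the decode-then-postprocess idiom of current diffusers pipelines. A self-contained sketch of that path outside the pipeline; the checkpoint name is only an example:

    import torch
    from diffusers import AutoencoderKL
    from diffusers.image_processor import VaeImageProcessor

    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
    image_processor = VaeImageProcessor(vae_scale_factor=2 ** (len(vae.config.block_out_channels) - 1))

    latents = torch.randn(1, 4, 64, 64)  # stand-in for denoised latents
    with torch.no_grad():
        image = vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]
    images = image_processor.postprocess(
        image, output_type="pil", do_denormalize=[True] * image.shape[0]
    )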