From 89d471652644f449966a0cd944041c98dab7f66c Mon Sep 17 00:00:00 2001
From: Volpeon <git@volpeon.ink>
Date: Fri, 13 Jan 2023 07:25:24 +0100
Subject: Code deduplication

---
 .../stable_diffusion/vlpn_stable_diffusion.py      | 32 ++++++----------------
 1 file changed, 9 insertions(+), 23 deletions(-)

(limited to 'pipelines/stable_diffusion')

diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index cb300d1..6bc40e9 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -20,7 +20,7 @@ from diffusers import (
     PNDMScheduler,
 )
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.utils import logging
+from diffusers.utils import logging, randn_tensor
 from transformers import CLIPTextModel, CLIPTokenizer
 from models.clip.prompt import PromptProcessor
 
@@ -250,8 +250,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
         return timesteps
 
-    def prepare_latents(self, batch_size, num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
 
         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
@@ -260,28 +260,16 @@ class VlpnStableDiffusion(DiffusionPipeline):
             )
 
         if latents is None:
-            rand_device = "cpu" if device.type == "mps" else device
-
-            if isinstance(generator, list):
-                shape = (1,) + shape[1:]
-                latents = [
-                    torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
-                    for i in range(batch_size)
-                ]
-                latents = torch.cat(latents, dim=0).to(device)
-            else:
-                latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
+            latents = latents.to(device=device, dtype=dtype)
 
         # scale the initial noise by the standard deviation required by the scheduler
         latents = latents * self.scheduler.init_noise_sigma
 
         return latents
 
-    def prepare_latents_from_image(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+    def prepare_latents_from_image(self, init_image, timestep, batch_size, dtype, device, generator=None):
         init_image = init_image.to(device=device, dtype=dtype)
         init_latent_dist = self.vae.encode(init_image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
@@ -292,7 +280,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
             )
         else:
-            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+            init_latents = torch.cat([init_latents] * batch_size, dim=0)
 
         # add noise to latents using the timesteps
         noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype)
@@ -430,16 +418,14 @@ class VlpnStableDiffusion(DiffusionPipeline):
             latents = self.prepare_latents_from_image(
                 image,
                 latent_timestep,
-                batch_size,
-                num_images_per_prompt,
+                batch_size * num_images_per_prompt,
                 text_embeddings.dtype,
                 device,
                 generator
             )
         else:
             latents = self.prepare_latents(
-                batch_size,
-                num_images_per_prompt,
+                batch_size * num_images_per_prompt,
                 num_channels_latents,
                 height,
                 width,
-- 
cgit v1.2.3-70-g09d2