From 64c594869135354a38353551bd58a93e15bd5b85 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Sun, 2 Oct 2022 20:57:43 +0200
Subject: Small performance improvements

---
 infer.py                                            | 18 +++++++++++++-----
 pipelines/stable_diffusion/vlpn_stable_diffusion.py | 18 ++++++++++--------
 schedulers/scheduling_euler_a.py                    |  9 +++++----
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/infer.py b/infer.py
index f2c380f..b15b17f 100644
--- a/infer.py
+++ b/infer.py
@@ -19,6 +19,7 @@ from schedulers.scheduling_euler_a import EulerAScheduler
 default_args = {
     "model": None,
     "scheduler": "euler_a",
+    "precision": "bf16",
     "output_dir": "output/inference",
     "config": None,
 }
@@ -28,7 +29,7 @@ default_cmds = {
     "prompt": None,
     "negative_prompt": None,
     "image": None,
-    "image_strength": .3,
+    "image_noise": .7,
     "width": 512,
     "height": 512,
     "batch_size": 1,
@@ -62,6 +63,11 @@ def create_args_parser():
         type=str,
         choices=["plms", "ddim", "klms", "euler_a"],
     )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        choices=["fp32", "fp16", "bf16"],
+    )
     parser.add_argument(
         "--output_dir",
         type=str,
@@ -91,7 +97,7 @@ def create_cmd_parser():
         type=str,
     )
     parser.add_argument(
-        "--image_strength",
+        "--image_noise",
         type=float,
     )
     parser.add_argument(
@@ -153,7 +159,7 @@ def save_args(basepath, args, extra={}):
         json.dump(info, f, indent=4)
 
 
-def create_pipeline(model, scheduler, dtype=torch.bfloat16):
+def create_pipeline(model, scheduler, dtype):
     print("Loading Stable Diffusion pipeline...")
 
     tokenizer = CLIPTokenizer.from_pretrained(model + '/tokenizer', torch_dtype=dtype)
@@ -225,7 +231,7 @@ def generate(output_dir, pipeline, args):
                 guidance_scale=args.guidance_scale,
                 generator=generator,
                 latents=init_image,
-                strength=args.image_strength,
+                strength=args.image_noise,
             ).images
 
         for j, image in enumerate(images):
@@ -279,9 +285,11 @@ def main():
     args_parser = create_args_parser()
     args = run_parser(args_parser, default_args)
 
+    output_dir = Path(args.output_dir)
+    dtype = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}[args.precision]
 
-    pipeline = create_pipeline(args.model, args.scheduler)
+    pipeline = create_pipeline(args.model, args.scheduler, dtype)
 
     cmd_parser = create_cmd_parser()
     cmd_prompt = CmdParse(output_dir, pipeline, cmd_parser)
     cmd_prompt.cmdloop()
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index b4c85e9..8fbe5f9 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -223,15 +223,16 @@ class VlpnStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_device = "cpu" if self.device.type == "mps" else self.device
+        latents_dtype = text_embeddings.dtype
         latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
         if latents is None:
-            latents = torch.randn(
-                latents_shape,
-                generator=generator,
-                device=latents_device,
-                dtype=text_embeddings.dtype,
-            )
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
         elif isinstance(latents, PIL.Image.Image):
             latents = preprocess(latents, width, height)
             latent_dist = self.vae.encode(latents.to(self.device)).latent_dist
@@ -259,7 +260,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
+            if latents.device != self.device:
+                raise ValueError(f"Unexpected latents device, got {latents.device}, expected {self.device}")
 
         # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
         if ensure_sigma:
diff --git a/schedulers/scheduling_euler_a.py b/schedulers/scheduling_euler_a.py
index a2d0e9f..d7fea85 100644
--- a/schedulers/scheduling_euler_a.py
+++ b/schedulers/scheduling_euler_a.py
@@ -36,7 +36,7 @@ def get_sigmas(sigmas, n=None):
     if n is None:
         return append_zero(sigmas.flip(0))
     t_max = len(sigmas) - 1  # = 999
-    t = torch.linspace(t_max, 0, n, device=sigmas.device)
+    t = torch.linspace(t_max, 0, n, device=sigmas.device, dtype=sigmas.dtype)
     return append_zero(t_to_sigma(t, sigmas))
 
 # from k_samplers utils.py
@@ -91,9 +91,10 @@ def DSsigma_to_t(sigma, quantize=False, DSsigmas=None):
 
 
 def DiscreteEpsDDPMDenoiserForward(Unet, input, sigma, DSsigmas=None, quantize=False, **kwargs):
-    sigma = sigma.to(Unet.device)
-    DSsigmas = DSsigmas.to(Unet.device)
+    sigma = sigma.to(dtype=input.dtype, device=Unet.device)
+    DSsigmas = DSsigmas.to(dtype=input.dtype, device=Unet.device)
     c_out, c_in = [append_dims(x, input.ndim) for x in get_scalings(sigma)]
+    # print(f">>>>>>>>>>> {input.dtype} {c_in.dtype} {sigma.dtype} {DSsigmas.dtype}")
     eps = Unet(input * c_in, DSsigma_to_t(sigma, quantize=quantize, DSsigmas=DSsigmas),
                encoder_hidden_states=kwargs['cond']).sample
     return input + eps * c_out
@@ -226,7 +227,7 @@ class EulerAScheduler(SchedulerMixin, ConfigMixin):
         d = to_d(latents, s, model_output)
         dt = sigma_down - s
         latents = latents + d * dt
-        latents = latents + torch.randn(latents.shape, layout=latents.layout, device=latents.device,
+        latents = latents + torch.randn(latents.shape, layout=latents.layout, device=latents.device, dtype=latents.dtype,
                                         generator=generator) * sigma_up
 
         return SchedulerOutput(prev_sample=latents)
-- 
cgit v1.2.3-54-g00ecf