From c5cc1318c2a7597fe62d3379e50187d0b0f22538 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Thu, 1 Dec 2022 13:45:21 +0100
Subject: Update

---
 dreambooth.py                                      | 11 +++++++--
 infer.py                                           |  3 ++-
 .../stable_diffusion/vlpn_stable_diffusion.py      | 27 +++++++++-------------
 textual_inversion.py                               |  1 +
 4 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/dreambooth.py b/dreambooth.py
index 3dd0920..31dbea2 100644
--- a/dreambooth.py
+++ b/dreambooth.py
@@ -32,6 +32,7 @@ logger = get_logger(__name__)
 
 
 torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
 
 
 def parse_args():
@@ -474,7 +475,6 @@ class Checkpointer:
             scheduler=self.scheduler,
         ).to(self.accelerator.device)
         pipeline.set_progress_bar_config(dynamic_ncols=True)
-        pipeline.enable_vae_slicing()
 
         train_data = self.datamodule.train_dataloader()
         val_data = self.datamodule.val_dataloader()
@@ -550,6 +550,12 @@ class Checkpointer:
 def main():
     args = parse_args()
 
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
     instance_identifier = args.instance_identifier
 
     if len(args.placeholder_token) != 0:
@@ -587,6 +593,7 @@ def main():
     checkpoint_scheduler = DPMSolverMultistepScheduler.from_pretrained(
         args.pretrained_model_name_or_path, subfolder='scheduler')
 
+    vae.enable_slicing()
     unet.set_use_memory_efficient_attention_xformers(True)
 
     if args.gradient_checkpointing:
@@ -903,7 +910,7 @@ def main():
         sample_checkpoint = False
 
         for step, batch in enumerate(train_dataloader):
-            with accelerator.accumulate(itertools.chain(unet, text_encoder)):
+            with accelerator.accumulate(unet):
                 # Convert images to latent space
                 latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                 latents = latents * 0.18215
diff --git a/infer.py b/infer.py
index ab5f247..eabeb5e 100644
--- a/infer.py
+++ b/infer.py
@@ -16,6 +16,7 @@ from pipelines.stable_diffusion.vlpn_stable_diffusion import VlpnStableDiffusion
 
 
 torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
 
 
 default_args = {
@@ -37,7 +38,7 @@ default_cmds = {
     "height": 512,
     "batch_size": 1,
     "batch_num": 1,
-    "steps": 50,
+    "steps": 30,
     "guidance_scale": 7.0,
     "seed": None,
     "config": None,
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index c77c4d1..9b51763 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -20,7 +20,6 @@ from diffusers import (
     PNDMScheduler,
 )
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.models.vae import DecoderOutput
 from diffusers.utils import logging
 from transformers import CLIPTextModel, CLIPTokenizer
 from models.clip.prompt import PromptProcessor
@@ -70,7 +69,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler._internal_dict = FrozenDict(new_config)
 
         self.prompt_processor = PromptProcessor(tokenizer, text_encoder)
-        self.use_slicing = False
 
         self.register_modules(
             vae=vae,
@@ -108,9 +106,14 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     def disable_attention_slicing(self):
@@ -144,14 +147,14 @@ class VlpnStableDiffusion(DiffusionPipeline):
             When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
             steps. This is useful to save some memory and allow larger batch sizes.
         """
-        self.use_slicing = True
+        self.vae.enable_slicing()
 
     def disable_vae_slicing(self):
         r"""
         Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
         computing decoding in one step.
         """
-        self.use_slicing = False
+        self.vae.disable_slicing()
 
     @property
     def execution_device(self):
@@ -297,20 +300,12 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
     def decode_latents(self, latents):
         latents = 1 / 0.18215 * latents
-        image = self.vae_decode(latents).sample
+        image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
-    def vae_decode(self, latents):
-        if self.use_slicing:
-            decoded_slices = [self.vae.decode(latents_slice).sample for latents_slice in latents.split(1)]
-            decoded = torch.cat(decoded_slices)
-            return DecoderOutput(sample=decoded)
-        else:
-            return self.vae.decode(latents)
-
     @torch.no_grad()
     def __call__(
         self,
diff --git a/textual_inversion.py b/textual_inversion.py
index 7ac9638..d6be522 100644
--- a/textual_inversion.py
+++ b/textual_inversion.py
@@ -31,6 +31,7 @@ logger = get_logger(__name__)
 
 
 torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
 
 
 def parse_args():
-- 
cgit v1.2.3-54-g00ecf