From 7c02c2fe68da2411623f0a11c1187ccf0f7743d8 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Thu, 1 Dec 2022 22:01:47 +0100
Subject: Update

---
 dreambooth.py        | 17 +++++++++--------
 environment.yaml     |  2 +-
 infer.py             |  1 +
 textual_inversion.py |  9 +++++----
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/dreambooth.py b/dreambooth.py
index 31dbea2..1ead6dd 100644
--- a/dreambooth.py
+++ b/dreambooth.py
@@ -550,11 +550,11 @@ class Checkpointer:
 def main():
     args = parse_args()
 
-    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
-        raise ValueError(
-            "Gradient accumulation is not supported when training the text encoder in distributed training. "
-            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
-        )
+    # if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+    #     raise ValueError(
+    #         "Gradient accumulation is not supported when training the text encoder in distributed training. "
+    #         "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+    #     )
 
     instance_identifier = args.instance_identifier
 
@@ -899,6 +899,9 @@ def main():
     )
     global_progress_bar.set_description("Total progress")
 
+    index_fixed_tokens = torch.arange(len(tokenizer))
+    index_fixed_tokens = index_fixed_tokens[~torch.isin(index_fixed_tokens, torch.tensor(placeholder_token_id))]
+
     try:
         for epoch in range(num_epochs):
             local_progress_bar.set_description(f"Epoch {epoch + 1} / {num_epochs}")
@@ -910,7 +913,7 @@ def main():
             sample_checkpoint = False
 
             for step, batch in enumerate(train_dataloader):
-                with accelerator.accumulate(unet):
+                with accelerator.accumulate(itertools.chain(unet, text_encoder)):
                     # Convert images to latent space
                     latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                     latents = latents * 0.18215
@@ -967,8 +970,6 @@ def main():
                        else:
                            token_embeds = text_encoder.get_input_embeddings().weight
 
-                        # Get the index for tokens that we want to freeze
-                        index_fixed_tokens = torch.arange(len(tokenizer)) != placeholder_token_id
                         token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]
 
                     if accelerator.sync_gradients:
diff --git a/environment.yaml b/environment.yaml
index 4972ebd..24693d5 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -11,7 +11,7 @@ dependencies:
   - pytorch=1.12.1
   - torchvision=0.13.1
   - pandas=1.4.3
-  - xformers=0.0.15.dev337
+  - xformers=0.0.15.dev344
   - pip:
     - -e .
     - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
diff --git a/infer.py b/infer.py
index eabeb5e..75c8621 100644
--- a/infer.py
+++ b/infer.py
@@ -219,6 +219,7 @@ def create_pipeline(model, ti_embeddings_dir, dtype):
         scheduler=scheduler,
     )
     pipeline.enable_xformers_memory_efficient_attention()
+    pipeline.enable_vae_slicing()
     pipeline.to("cuda")
 
     print("Pipeline loaded.")
diff --git a/textual_inversion.py b/textual_inversion.py
index d6be522..80f1d7d 100644
--- a/textual_inversion.py
+++ b/textual_inversion.py
@@ -545,6 +545,7 @@ def main():
     checkpoint_scheduler = DPMSolverMultistepScheduler.from_pretrained(
         args.pretrained_model_name_or_path, subfolder='scheduler')
 
+    vae.enable_slicing()
     unet.set_use_memory_efficient_attention_xformers(True)
 
     if args.gradient_checkpointing:
@@ -814,6 +815,9 @@ def main():
     )
     global_progress_bar.set_description("Total progress")
 
+    index_fixed_tokens = torch.arange(len(tokenizer))
+    index_fixed_tokens = index_fixed_tokens[~torch.isin(index_fixed_tokens, torch.tensor(placeholder_token_id))]
+
     try:
         for epoch in range(num_epochs):
             local_progress_bar.set_description(f"Epoch {epoch + 1} / {num_epochs}")
@@ -827,7 +831,7 @@ def main():
             for step, batch in enumerate(train_dataloader):
                 with accelerator.accumulate(text_encoder):
                     # Convert images to latent space
-                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
                     latents = latents * 0.18215
 
                     # Sample noise that we'll add to the latents
@@ -883,7 +887,6 @@ def main():
                         token_embeds = text_encoder.get_input_embeddings().weight
 
                     # Get the index for tokens that we want to freeze
-                    index_fixed_tokens = torch.arange(len(tokenizer)) != placeholder_token_id
                     token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]
 
                     optimizer.step()
@@ -927,8 +930,6 @@ def main():
 
         accelerator.wait_for_everyone()
 
-        print(token_embeds[placeholder_token_id])
-
         text_encoder.eval()
         val_loss = 0.0
 
-- 
cgit v1.2.3-54-g00ecf
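
Note on the recurring change: both training scripts previously rebuilt the frozen-token mask inside the loop with torch.arange(len(tokenizer)) != placeholder_token_id, which only compares against a single id; the patch computes an index once before the loop with torch.isin, which also covers the case where placeholder_token_id holds several ids. The following is a minimal sketch of that technique, not the scripts themselves; tokenizer, text_encoder, and placeholder_token_id are stand-ins for objects the scripts already define, and the snapshot line is illustrative (the scripts keep their own original_token_embeds).

import torch

# Stand-ins for objects the training scripts already provide:
#   tokenizer            - len(tokenizer) is the vocabulary size
#   text_encoder         - model whose input embedding table is being trained
#   placeholder_token_id - a single id or a list of ids for the new token(s)

# Snapshot the embedding table so the frozen rows can be restored after each step.
original_token_embeds = text_encoder.get_input_embeddings().weight.detach().clone()

# Computed once, before the training loop: every token id except the placeholder(s).
index_fixed_tokens = torch.arange(len(tokenizer))
index_fixed_tokens = index_fixed_tokens[
    ~torch.isin(index_fixed_tokens, torch.tensor(placeholder_token_id))
]

# After each optimizer.step(): copy the original values back into the frozen rows,
# so only the placeholder embedding(s) keep their gradient update.
token_embeds = text_encoder.get_input_embeddings().weight
token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]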