From 0767c7bc82645186159965c2a6be4278e33c6721 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Thu, 23 Mar 2023 11:07:57 +0100
Subject: Update

---
 data/csv.py                                         |  2 +-
 models/clip/util.py                                 |  6 +++---
 pipelines/stable_diffusion/vlpn_stable_diffusion.py |  4 +++-
 train_dreambooth.py                                 | 11 +++++++----
 train_lora.py                                       |  3 ---
 train_ti.py                                         |  2 +-
 training/functional.py                              |  7 ++-----
 training/strategy/dreambooth.py                     |  8 ++++----
 training/strategy/lora.py                           | 12 ++++++------
 training/strategy/ti.py                             |  4 ++--
 10 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/data/csv.py b/data/csv.py
index 619452e..fba5d4b 100644
--- a/data/csv.py
+++ b/data/csv.py
@@ -282,7 +282,7 @@ class VlpnDataModule():
         collate_fn_ = partial(collate_fn, self.dtype, self.tokenizer, self.num_class_images != 0)
 
         if valid_set_size == 0:
-            data_train, data_val = items, items[:self.batch_size]
+            data_train, data_val = items, items
         else:
             data_train, data_val = random_split(items, [train_set_size, valid_set_size], generator=generator)
 
diff --git a/models/clip/util.py b/models/clip/util.py
index 8de8c19..883de6a 100644
--- a/models/clip/util.py
+++ b/models/clip/util.py
@@ -23,11 +23,11 @@ def get_extended_embeddings(
     model_max_length = text_encoder.config.max_position_embeddings
     prompts = input_ids.shape[0]
 
-    input_ids = input_ids.view((-1, model_max_length)).to(text_encoder.device)
+    input_ids = input_ids.view((-1, model_max_length))
     if position_ids is not None:
-        position_ids = position_ids.view((-1, model_max_length)).to(text_encoder.device)
+        position_ids = position_ids.view((-1, model_max_length))
     if attention_mask is not None:
-        attention_mask = attention_mask.view((-1, model_max_length)).to(text_encoder.device)
+        attention_mask = attention_mask.view((-1, model_max_length))
 
     text_embeddings = text_encoder(input_ids, position_ids=position_ids, attention_mask=attention_mask)[0]
     text_embeddings = text_embeddings.view((prompts, -1, text_embeddings.shape[2]))
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index 4505a2a..dbd262f 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -291,7 +291,8 @@ class VlpnStableDiffusion(DiffusionPipeline):
         else:
             attention_mask = None
 
-        prompt_embeds = get_extended_embeddings(self.text_encoder, text_input_ids, attention_mask)
+        prompt_embeds = get_extended_embeddings(self.text_encoder, text_input_ids.to(device), attention_mask)
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
 
         return prompt_embeds
 
@@ -374,6 +375,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
+        # image = self.vae.decode(latents.to(dtype=self.vae.dtype)).sample
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
diff --git a/train_dreambooth.py b/train_dreambooth.py
index f8f6e84..a85ae4c 100644
--- a/train_dreambooth.py
+++ b/train_dreambooth.py
@@ -582,12 +582,15 @@ def main():
     )
     datamodule.setup()
 
-    optimizer = create_optimizer(
-        itertools.chain(
-            unet.parameters(),
+    params_to_optimize = (unet.parameters(), )
+    if args.train_text_encoder_epochs != 0:
+        params_to_optimize += (
             text_encoder.text_model.encoder.parameters(),
             text_encoder.text_model.final_layer_norm.parameters(),
-        ),
+    )
+
+    optimizer = create_optimizer(
+        itertools.chain(*params_to_optimize),
         lr=args.learning_rate,
     )
 
diff --git a/train_lora.py b/train_lora.py
index 787f271..8dd3c86 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -424,9 +424,6 @@ def main():
     tokenizer, text_encoder, vae, unet, noise_scheduler, sample_scheduler, embeddings = get_models(
         args.pretrained_model_name_or_path)
 
-    tokenizer.set_use_vector_shuffle(args.vector_shuffle)
-    tokenizer.set_dropout(args.vector_dropout)
-
     vae.enable_slicing()
     vae.set_use_memory_efficient_attention_xformers(True)
     unet.enable_xformers_memory_efficient_attention()
diff --git a/train_ti.py b/train_ti.py
index 7aeff7c..9bc74c1 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -340,7 +340,7 @@ def parse_args():
     parser.add_argument(
         "--checkpoint_frequency",
         type=int,
-        default=5,
+        default=999999,
         help="How often to save a checkpoint and sample image (in epochs)",
     )
     parser.add_argument(
diff --git a/training/functional.py b/training/functional.py
index ebb48ab..015fe5e 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -259,7 +259,7 @@ def snr_weight(noisy_latents, latents, gamma):
         sigma_mean_sq = F.mse_loss(sigma.float(), zeros.float(), reduction="none").mean([1, 2, 3])
         snr = torch.div(alpha_mean_sq, sigma_mean_sq)
         gamma_over_snr = torch.div(torch.ones_like(snr) * gamma, snr)
-        snr_weight = torch.minimum(gamma_over_snr, torch.ones_like(gamma_over_snr)).float()
+        snr_weight = torch.fmin(gamma_over_snr, torch.ones_like(gamma_over_snr)).float()
         return snr_weight
 
     return torch.tensor(
@@ -471,10 +471,7 @@ def train_loop(
                     "lr": lr_scheduler.get_last_lr()[0],
                 }
                 if isDadaptation:
-                    logs["lr/d*lr"] = (
-                        optimizer.param_groups[0]["d"] *
-                        optimizer.param_groups[0]["lr"]
-                    )
+                    logs["lr/d*lr"] = optimizer.param_groups[0]["d"] * optimizer.param_groups[0]["lr"]
                 logs.update(on_log())
 
                 local_progress_bar.set_postfix(**logs)
diff --git a/training/strategy/dreambooth.py b/training/strategy/dreambooth.py
index e5e84c8..28fccff 100644
--- a/training/strategy/dreambooth.py
+++ b/training/strategy/dreambooth.py
@@ -137,8 +137,8 @@ def dreambooth_strategy_callbacks(
 
         print("Saving model...")
 
-        unet_ = accelerator.unwrap_model(unet, False)
-        text_encoder_ = accelerator.unwrap_model(text_encoder, False)
+        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=False)
+        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=False)
 
         with ema_context():
             pipeline = VlpnStableDiffusion(
@@ -160,8 +160,8 @@ def dreambooth_strategy_callbacks(
     @torch.no_grad()
     def on_sample(step):
         with ema_context():
-            unet_ = accelerator.unwrap_model(unet, False)
-            text_encoder_ = accelerator.unwrap_model(text_encoder, False)
+            unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
+            text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
 
             orig_unet_dtype = unet_.dtype
             orig_text_encoder_dtype = text_encoder_.dtype
diff --git a/training/strategy/lora.py b/training/strategy/lora.py
index aa75bec..1c8fad6 100644
--- a/training/strategy/lora.py
+++ b/training/strategy/lora.py
@@ -47,7 +47,6 @@ def lora_strategy_callbacks(
     save_samples_ = partial(
         save_samples,
         accelerator=accelerator,
-        text_encoder=text_encoder,
         tokenizer=tokenizer,
         vae=vae,
         sample_scheduler=sample_scheduler,
@@ -72,6 +71,7 @@ def lora_strategy_callbacks(
     @contextmanager
     def on_train(epoch: int):
         tokenizer.train()
+        text_encoder.train()
         yield
 
     @contextmanager
@@ -89,8 +89,8 @@ def lora_strategy_callbacks(
     def on_checkpoint(step, postfix):
         print(f"Saving checkpoint for step {step}...")
 
-        unet_ = accelerator.unwrap_model(unet, False)
-        text_encoder_ = accelerator.unwrap_model(text_encoder, False)
+        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=False)
+        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=False)
 
         lora_config = {}
         state_dict = get_peft_model_state_dict(unet, state_dict=accelerator.get_state_dict(unet))
@@ -111,10 +111,10 @@ def lora_strategy_callbacks(
 
     @torch.no_grad()
     def on_sample(step):
-        unet_ = accelerator.unwrap_model(unet, False)
-        text_encoder_ = accelerator.unwrap_model(text_encoder, False)
+        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
+        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
 
-        save_samples_(step=step, unet=unet_)
+        save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
 
         del unet_
         del text_encoder_
diff --git a/training/strategy/ti.py b/training/strategy/ti.py
index bd0d178..2038e34 100644
--- a/training/strategy/ti.py
+++ b/training/strategy/ti.py
@@ -156,8 +156,8 @@ def textual_inversion_strategy_callbacks(
     @torch.no_grad()
     def on_sample(step):
         with ema_context():
-            unet_ = accelerator.unwrap_model(unet, False)
-            text_encoder_ = accelerator.unwrap_model(text_encoder, False)
+            unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
+            text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
 
             orig_unet_dtype = unet_.dtype
             orig_text_encoder_dtype = text_encoder_.dtype
-- 
cgit v1.2.3-54-g00ecf