From b33ac00de283fe45edba689990dc96a5de93cd1e Mon Sep 17 00:00:00 2001
From: Volpeon <git@volpeon.ink>
Date: Tue, 13 Dec 2022 09:40:34 +0100
Subject: Add support for resume in Textual Inversion

---
 textual_inversion.py | 119 +++++++++++++++++++++++----------------------------
 1 file changed, 54 insertions(+), 65 deletions(-)

(limited to 'textual_inversion.py')

diff --git a/textual_inversion.py b/textual_inversion.py
index a9c3326..11babd8 100644
--- a/textual_inversion.py
+++ b/textual_inversion.py
@@ -170,7 +170,7 @@ def parse_args():
     parser.add_argument(
         "--lr_scheduler",
         type=str,
-        default="one_cycle",
+        default="constant_with_warmup",
         help=(
             'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
             ' "constant", "constant_with_warmup", "one_cycle"]'
@@ -231,14 +231,14 @@ def parse_args():
     parser.add_argument(
         "--checkpoint_frequency",
         type=int,
-        default=500,
-        help="How often to save a checkpoint and sample image",
+        default=5,
+        help="How often to save a checkpoint and sample image (in epochs)",
     )
     parser.add_argument(
         "--sample_frequency",
         type=int,
-        default=100,
-        help="How often to save a checkpoint and sample image",
+        default=1,
+        help="How often to save a checkpoint and sample image (in epochs)",
     )
     parser.add_argument(
         "--sample_image_size",
@@ -294,10 +294,9 @@ def parse_args():
         help="Path to a directory to resume training from (ie, logs/token_name/2022-09-22T23-36-27)"
     )
     parser.add_argument(
-        "--resume_checkpoint",
-        type=str,
-        default=None,
-        help="Path to a specific checkpoint to resume training from (ie, logs/token_name/2022-09-22T23-36-27/checkpoints/something.bin)."
+        "--global_step",
+        type=int,
+        default=0,
     )
     parser.add_argument(
         "--config",
@@ -512,19 +511,10 @@ def main():
     if len(args.placeholder_token) != 0:
         instance_identifier = instance_identifier.format(args.placeholder_token[0])
 
-    global_step_offset = 0
-    if args.resume_from is not None:
-        basepath = Path(args.resume_from)
-        print("Resuming state from %s" % args.resume_from)
-        with open(basepath.joinpath("resume.json"), 'r') as f:
-            state = json.load(f)
-        global_step_offset = state["args"].get("global_step", 0)
-
-        print("We've trained %d steps so far" % global_step_offset)
-    else:
-        now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
-        basepath = Path(args.output_dir).joinpath(slugify(instance_identifier), now)
-        basepath.mkdir(parents=True, exist_ok=True)
+    global_step_offset = args.global_step
+    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+    basepath = Path(args.output_dir).joinpath(slugify(instance_identifier), now)
+    basepath.mkdir(parents=True, exist_ok=True)
 
     accelerator = Accelerator(
         log_with=LoggerType.TENSORBOARD,
@@ -557,6 +547,7 @@ def main():
     set_use_memory_efficient_attention_xformers(vae, True)
 
     if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
         text_encoder.gradient_checkpointing_enable()
 
     print(f"Adding text embeddings: {args.placeholder_token}")
@@ -577,14 +568,25 @@ def main():
 
     # Initialise the newly added placeholder token with the embeddings of the initializer token
     token_embeds = text_encoder.get_input_embeddings().weight.data
-    original_token_embeds = token_embeds.detach().clone().to(accelerator.device)
 
-    if args.resume_checkpoint is not None:
-        token_embeds[placeholder_token_id] = torch.load(args.resume_checkpoint)[args.placeholder_token]
-    else:
-        initializer_token_embeddings = text_encoder.get_input_embeddings()(initializer_token_ids)
-        for (token_id, embeddings) in zip(placeholder_token_id, initializer_token_embeddings):
-            token_embeds[token_id] = embeddings
+    if args.resume_from:
+        resumepath = Path(args.resume_from).joinpath("checkpoints")
+
+        for (token_id, token) in zip(placeholder_token_id, args.placeholder_token):
+            embedding_file = resumepath.joinpath(f"{token}_{args.global_step}_end.bin")
+            embedding_data = torch.load(embedding_file, map_location="cpu")
+
+            emb = next(iter(embedding_data.values()))
+            if len(emb.shape) == 1:
+                emb = emb.unsqueeze(0)
+
+            token_embeds[token_id] = emb
+
+    original_token_embeds = token_embeds.clone().to(accelerator.device)
+
+    initializer_token_embeddings = text_encoder.get_input_embeddings()(initializer_token_ids)
+    for (token_id, embeddings) in zip(placeholder_token_id, initializer_token_embeddings):
+        token_embeds[token_id] = embeddings
 
     index_fixed_tokens = torch.arange(len(tokenizer))
     index_fixed_tokens = index_fixed_tokens[~torch.isin(index_fixed_tokens, torch.tensor(placeholder_token_id))]
@@ -891,21 +893,16 @@ def main():
 
                     accelerator.backward(loss)
 
-                    # Keep the token embeddings fixed except the newly added
-                    # embeddings for the concept, as we only want to optimize the concept embeddings
-                    if accelerator.num_processes > 1:
-                        token_embeds = text_encoder.module.get_input_embeddings().weight
-                    else:
-                        token_embeds = text_encoder.get_input_embeddings().weight
-
-                    # Get the index for tokens that we want to freeze
-                    token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]
-
                     optimizer.step()
                     if not accelerator.optimizer_step_was_skipped:
                         lr_scheduler.step()
                     optimizer.zero_grad(set_to_none=True)
 
+                    # Let's make sure we don't update any embedding weights besides the newly added token
+                    with torch.no_grad():
+                        text_encoder.get_input_embeddings(
+                        ).weight[index_fixed_tokens] = original_token_embeds[index_fixed_tokens]
+
                     loss = loss.detach().item()
                     train_loss += loss
 
@@ -916,19 +913,6 @@ def main():
 
                     global_step += 1
 
-                    if global_step % args.sample_frequency == 0:
-                        sample_checkpoint = True
-
-                    if global_step % args.checkpoint_frequency == 0 and global_step > 0 and accelerator.is_main_process:
-                        local_progress_bar.clear()
-                        global_progress_bar.clear()
-
-                        checkpointer.checkpoint(global_step + global_step_offset, "training")
-                        save_args(basepath, args, {
-                            "global_step": global_step + global_step_offset,
-                            "resume_checkpoint": f"{basepath}/checkpoints/last.bin"
-                        })
-
                 logs = {"train/loss": loss, "lr": lr_scheduler.get_last_lr()[0]}
 
                 accelerator.log(logs, step=global_step)
@@ -992,24 +976,30 @@ def main():
             local_progress_bar.clear()
             global_progress_bar.clear()
 
-            if min_val_loss > val_loss:
-                accelerator.print(
-                    f"Global step {global_step}: Validation loss reached new minimum: {min_val_loss:.2e} -> {val_loss:.2e}")
-                checkpointer.checkpoint(global_step + global_step_offset, "milestone")
-                min_val_loss = val_loss
+            if accelerator.is_main_process:
+                if min_val_loss > val_loss:
+                    accelerator.print(
+                        f"Global step {global_step}: Validation loss reached new minimum: {min_val_loss:.2e} -> {val_loss:.2e}")
+                    checkpointer.checkpoint(global_step + global_step_offset, "milestone")
+                    min_val_loss = val_loss
+
+                if epoch % args.checkpoint_frequency == 0:
+                    checkpointer.checkpoint(global_step + global_step_offset, "training")
+                    save_args(basepath, args, {
+                        "global_step": global_step + global_step_offset
+                    })
 
-            if sample_checkpoint and accelerator.is_main_process:
-                checkpointer.save_samples(
-                    global_step + global_step_offset,
-                    args.resolution, args.resolution, 7.5, 0.0, args.sample_steps)
+                if epoch % args.sample_frequency == 0:
+                    checkpointer.save_samples(
+                        global_step + global_step_offset,
+                        args.resolution, args.resolution, 7.5, 0.0, args.sample_steps)
 
         # Create the pipeline using using the trained modules and save it.
         if accelerator.is_main_process:
             print("Finished! Saving final checkpoint and resume state.")
             checkpointer.checkpoint(global_step + global_step_offset, "end")
             save_args(basepath, args, {
-                "global_step": global_step + global_step_offset,
-                "resume_checkpoint": f"{basepath}/checkpoints/last.bin"
+                "global_step": global_step + global_step_offset
             })
             accelerator.end_training()
 
@@ -1018,8 +1008,7 @@ def main():
             print("Interrupted, saving checkpoint and resume state...")
             checkpointer.checkpoint(global_step + global_step_offset, "end")
             save_args(basepath, args, {
-                "global_step": global_step + global_step_offset,
-                "resume_checkpoint": f"{basepath}/checkpoints/last.bin"
+                "global_step": global_step + global_step_offset
             })
             accelerator.end_training()
         quit()
-- 
cgit v1.2.3-70-g09d2