Improved automation capabilities

author: Volpeon <git@volpeon.ink> 2023-04-16 19:03:25 +0200
committer: Volpeon <git@volpeon.ink> 2023-04-16 19:03:25 +0200
commit: 71f4a40bb48be4f2759ba2d83faff39691cb2955 (patch)
tree: 29c704ca549a4c4323403b6cbb0e62f54040ae22
parent: Added option to use constant LR on cycles > 1 (diff)
download: textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.tar.gz
textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.tar.bz2
textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.zip
6 files changed, 100 insertions, 54 deletions
diff --git a/train_lora.py b/train_lora.py
index 4d4c16a..ba5aee1 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -84,9 +84,9 @@ def parse_args():
    )
    parser.add_argument(
        "--auto_cycles",
-        type=int,
+        type=str,
-        default=1,
+        default="o",
-        help="How many cycles to run automatically."
+        help="Cycles to run automatically."
    )
    parser.add_argument(
        "--cycle_decay",
@@ -95,11 +95,6 @@ def parse_args():
        help="Learning rate decay per cycle."
    )
    parser.add_argument(
-        "--cycle_constant",
-        action="store_true",
-        help="Use constant LR on cycles > 1."
-    )
-    parser.add_argument(
        "--placeholder_tokens",
        type=str,
        nargs='*',
@@ -920,7 +915,6 @@ def main():
        annealing_func=args.lr_annealing_func,
        warmup_exp=args.lr_warmup_exp,
        annealing_exp=args.lr_annealing_exp,
-        cycles=args.lr_cycles,
        end_lr=1e2,
        mid_point=args.lr_mid_point,
    )
@@ -964,20 +958,38 @@ def main():
    lora_sample_output_dir = output_dir / lora_project / "samples"
+    auto_cycles = list(args.auto_cycles)
+    lr_scheduler = args.lr_scheduler
+    lr_warmup_epochs = args.lr_warmup_epochs
+    lr_cycles = args.lr_cycles
    while True:
-        if training_iter >= args.auto_cycles:
+        if len(auto_cycles) != 0:
-            response = input("Run another cycle? [y/n] ")
+            response = auto_cycles.pop(0)
-            if response.lower().strip() == "n":
+        else:
-                break
+            response = input("Choose action: [o] one_cycle, [w] warmup, [c] constant, [d] decay, [s] stop \n--> ")
+        if response.lower().strip() == "o":
+            lr_scheduler = "one_cycle"
+            lr_warmup_epochs = args.lr_warmup_epochs
+            lr_cycles = args.lr_cycles
+        if response.lower().strip() == "w":
+            lr_scheduler = "constant"
+            lr_warmup_epochs = num_train_epochs
+        if response.lower().strip() == "c":
+            lr_scheduler = "constant"
+            lr_warmup_epochs = 0
+        if response.lower().strip() == "d":
+            lr_scheduler = "cosine"
+            lr_warmup_epochs = 0
+            lr_cycles = 1
+        elif response.lower().strip() == "s":
+            break
        print("")
        print(f"============ LoRA cycle {training_iter + 1} ============")
        print("")
-        if args.cycle_constant and training_iter == 1:
-            args.lr_scheduler = "constant"
-            args.lr_warmup_epochs = 0
        params_to_optimize = []
        if len(args.placeholder_tokens) != 0:
@@ -1012,12 +1024,13 @@ def main():
        lora_optimizer = create_optimizer(params_to_optimize)
        lora_lr_scheduler = create_lr_scheduler(
-            args.lr_scheduler,
+            lr_scheduler,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            optimizer=lora_optimizer,
            num_training_steps_per_epoch=len(lora_datamodule.train_dataloader),
            train_epochs=num_train_epochs,
-            warmup_epochs=args.lr_warmup_epochs,
+            cycles=lr_cycles,
+            warmup_epochs=lr_warmup_epochs,
        )
        lora_checkpoint_output_dir = output_dir / lora_project / f"model_{training_iter + 1}"
@@ -1031,7 +1044,7 @@ def main():
            num_train_epochs=num_train_epochs,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            global_step_offset=training_iter * num_train_steps,
-            initial_samples=training_iter == 0,
+            cycle=training_iter,
            # --
            group_labels=group_labels,
            sample_output_dir=lora_sample_output_dir,
diff --git a/train_ti.py b/train_ti.py
index c452269..880320f 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -68,9 +68,9 @@ def parse_args():
    )
    parser.add_argument(
        "--auto_cycles",
-        type=int,
+        type=str,
-        default=1,
+        default="o",
-        help="How many cycles to run automatically."
+        help="Cycles to run automatically."
    )
    parser.add_argument(
        "--cycle_decay",
@@ -79,11 +79,6 @@ def parse_args():
        help="Learning rate decay per cycle."
    )
    parser.add_argument(
-        "--cycle_constant",
-        action="store_true",
-        help="Use constant LR on cycles > 1."
-    )
-    parser.add_argument(
        "--placeholder_tokens",
        type=str,
        nargs='*',
@@ -921,27 +916,45 @@ def main():
        sample_output_dir = output_dir / project / "samples"
+        auto_cycles = list(args.auto_cycles)
+        lr_scheduler = args.lr_scheduler
+        lr_warmup_epochs = args.lr_warmup_epochs
+        lr_cycles = args.lr_cycles
        while True:
-            if training_iter >= args.auto_cycles:
+            if len(auto_cycles) != 0:
-                response = input("Run another cycle? [y/n] ")
+                response = auto_cycles.pop(0)
-                if response.lower().strip() == "n":
+            else:
-                    break
+                response = input("Choose action: [o] one_cycle, [w] warmup, [c] constant, [d] decay, [s] stop \n--> ")
+            if response.lower().strip() == "o":
+                lr_scheduler = "one_cycle"
+                lr_warmup_epochs = args.lr_warmup_epochs
+                lr_cycles = args.lr_cycles
+            if response.lower().strip() == "w":
+                lr_scheduler = "constant"
+                lr_warmup_epochs = num_train_epochs
+            if response.lower().strip() == "c":
+                lr_scheduler = "constant"
+                lr_warmup_epochs = 0
+            if response.lower().strip() == "d":
+                lr_scheduler = "cosine"
+                lr_warmup_epochs = 0
+                lr_cycles = 1
+            elif response.lower().strip() == "s":
+                break
            print("")
            print(f"------------ TI cycle {training_iter + 1} ------------")
            print("")
-            if args.cycle_constant and training_iter == 1:
-                args.lr_scheduler = "constant"
-                args.lr_warmup_epochs = 0
            optimizer = create_optimizer(
                text_encoder.text_model.embeddings.token_embedding.parameters(),
                lr=learning_rate,
            )
            lr_scheduler = get_scheduler(
-                args.lr_scheduler,
+                lr_scheduler,
                optimizer=optimizer,
                num_training_steps_per_epoch=len(datamodule.train_dataloader),
                gradient_accumulation_steps=args.gradient_accumulation_steps,
@@ -950,10 +963,10 @@ def main():
                annealing_func=args.lr_annealing_func,
                warmup_exp=args.lr_warmup_exp,
                annealing_exp=args.lr_annealing_exp,
-                cycles=args.lr_cycles,
+                cycles=lr_cycles,
                end_lr=1e3,
                train_epochs=num_train_epochs,
-                warmup_epochs=args.lr_warmup_epochs,
+                warmup_epochs=lr_warmup_epochs,
                mid_point=args.lr_mid_point,
            )
@@ -966,7 +979,7 @@ def main():
                lr_scheduler=lr_scheduler,
                num_train_epochs=num_train_epochs,
                global_step_offset=training_iter * num_train_steps,
-                initial_samples=training_iter == 0,
+                cycle=training_iter,
                # --
                group_labels=["emb"],
                checkpoint_output_dir=checkpoint_output_dir,
diff --git a/training/functional.py b/training/functional.py
index 2da0f69..ebc40de 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -42,7 +42,7 @@ class TrainingCallbacks():
    on_after_optimize: Callable[[Any, dict[str, float]], None] = const()
    on_after_epoch: Callable[[], None] = const()
    on_eval: Callable[[], _GeneratorContextManager] = const(nullcontext())
-    on_sample: Callable[[int], None] = const()
+    on_sample: Callable[[int, int], None] = const()
    on_checkpoint: Callable[[int, str], None] = const()
@@ -96,6 +96,7 @@ def save_samples(
    output_dir: Path,
    seed: int,
    step: int,
+    cycle: int = 1,
    batch_size: int = 1,
    num_batches: int = 1,
    num_steps: int = 20,
@@ -125,7 +126,7 @@ def save_samples(
    for pool, data, gen in datasets:
        all_samples = []
-        file_path = output_dir / pool / f"step_{step}.jpg"
+        file_path = output_dir / pool / f"step_{cycle}_{step}.jpg"
        file_path.parent.mkdir(parents=True, exist_ok=True)
        batches = list(itertools.islice(itertools.cycle(data), batch_size * num_batches))
@@ -455,7 +456,7 @@ def train_loop(
    sample_frequency: int = 10,
    checkpoint_frequency: int = 50,
    milestone_checkpoints: bool = True,
-    initial_samples: bool = True,
+    cycle: int = 1,
    global_step_offset: int = 0,
    num_epochs: int = 100,
    gradient_accumulation_steps: int = 1,
@@ -518,12 +519,12 @@ def train_loop(
    try:
        for epoch in range(num_epochs):
            if accelerator.is_main_process:
-                if epoch % sample_frequency == 0 and (initial_samples or epoch != 0):
+                if epoch % sample_frequency == 0 and (cycle == 1 or epoch != 0):
                    local_progress_bar.clear()
                    global_progress_bar.clear()
                    with on_eval():
-                        on_sample(global_step)
+                        on_sample(cycle, global_step)
                if epoch % checkpoint_frequency == 0 and epoch != 0:
                    local_progress_bar.clear()
@@ -648,7 +649,7 @@ def train_loop(
        if accelerator.is_main_process:
            print("Finished!")
            with on_eval():
-                on_sample(global_step)
+                on_sample(cycle, global_step)
            on_checkpoint(global_step, "end")
    except KeyboardInterrupt:
@@ -680,7 +681,7 @@ def train(
    sample_frequency: int = 20,
    checkpoint_frequency: int = 50,
    milestone_checkpoints: bool = True,
-    initial_samples: bool = True,
+    cycle: int = 1,
    global_step_offset: int = 0,
    guidance_scale: float = 0.0,
    prior_loss_weight: float = 1.0,
@@ -731,7 +732,7 @@ def train(
        sample_frequency=sample_frequency,
        checkpoint_frequency=checkpoint_frequency,
        milestone_checkpoints=milestone_checkpoints,
-        initial_samples=initial_samples,
+        cycle=cycle,
        global_step_offset=global_step_offset,
        num_epochs=num_train_epochs,
        gradient_accumulation_steps=gradient_accumulation_steps,
diff --git a/training/strategy/dreambooth.py b/training/strategy/dreambooth.py
index 4ae28b7..e6fcc89 100644
--- a/training/strategy/dreambooth.py
+++ b/training/strategy/dreambooth.py
@@ -148,7 +148,7 @@ def dreambooth_strategy_callbacks(
            torch.cuda.empty_cache()
    @torch.no_grad()
-    def on_sample(step):
+    def on_sample(cycle, step):
        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
@@ -158,7 +158,7 @@ def dreambooth_strategy_callbacks(
        unet_.to(dtype=weight_dtype)
        text_encoder_.to(dtype=weight_dtype)
-        save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
+        save_samples_(cycle=cycle, step=step, unet=unet_, text_encoder=text_encoder_)
        unet_.to(dtype=orig_unet_dtype)
        text_encoder_.to(dtype=orig_text_encoder_dtype)
diff --git a/training/strategy/lora.py b/training/strategy/lora.py
index 48236fb..5c3012e 100644
--- a/training/strategy/lora.py
+++ b/training/strategy/lora.py
@@ -146,11 +146,11 @@ def lora_strategy_callbacks(
            torch.cuda.empty_cache()
    @torch.no_grad()
-    def on_sample(step):
+    def on_sample(cycle, step):
        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
-        save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
+        save_samples_(cycle=cycle, step=step, unet=unet_, text_encoder=text_encoder_)
        del unet_, text_encoder_
diff --git a/training/strategy/ti.py b/training/strategy/ti.py
index f0b84b5..6bbff64 100644
--- a/training/strategy/ti.py
+++ b/training/strategy/ti.py
@@ -104,10 +104,28 @@ def textual_inversion_strategy_callbacks(
            yield
    @torch.no_grad()
+    def on_before_optimize(epoch: int):
+        if use_emb_decay:
+            params = [
+                p
+                for p in text_encoder.text_model.embeddings.token_embedding.parameters()
+                if p.grad is not None
+            ]
+            return torch.stack(params) if len(params) != 0 else None
+    @torch.no_grad()
    def on_after_optimize(w, lrs: dict[str, float]):
        if ema_embeddings is not None:
            ema_embeddings.step(text_encoder.text_model.embeddings.token_embedding.parameters())
+        if use_emb_decay and w is not None:
+            lr = lrs["emb"] or lrs["0"]
+            lambda_ = emb_decay * lr
+            if lambda_ != 0:
+                norm = w[:, :].norm(dim=-1, keepdim=True)
+                w[:].add_((w[:] / norm.clamp_min(1e-12)) * lambda_ * (emb_decay_target - norm))
    def on_log():
        if ema_embeddings is not None:
            return {"ema_decay": ema_embeddings.decay}
@@ -125,7 +143,7 @@ def textual_inversion_strategy_callbacks(
                )
    @torch.no_grad()
-    def on_sample(step):
+    def on_sample(cycle, step):
        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
@@ -135,7 +153,7 @@ def textual_inversion_strategy_callbacks(
        unet_.to(dtype=weight_dtype)
        text_encoder_.to(dtype=weight_dtype)
-        save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
+        save_samples_(cycle=cycle, step=step, unet=unet_, text_encoder=text_encoder_)
        unet_.to(dtype=orig_unet_dtype)
        text_encoder_.to(dtype=orig_text_encoder_dtype)
@@ -148,6 +166,7 @@ def textual_inversion_strategy_callbacks(
    return TrainingCallbacks(
        on_train=on_train,
        on_eval=on_eval,
+        on_before_optimize=on_before_optimize,
        on_after_optimize=on_after_optimize,
        on_log=on_log,
        on_checkpoint=on_checkpoint,
author	Volpeon <git@volpeon.ink>	2023-04-16 19:03:25 +0200
committer	Volpeon <git@volpeon.ink>	2023-04-16 19:03:25 +0200
commit	71f4a40bb48be4f2759ba2d83faff39691cb2955 (patch)
tree	29c704ca549a4c4323403b6cbb0e62f54040ae22
parent	Added option to use constant LR on cycles > 1 (diff)
download	textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.tar.gz textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.tar.bz2 textual-inversion-diff-71f4a40bb48be4f2759ba2d83faff39691cb2955.zip