From 208e48134e324e934ad964bdc61880cc923f4c0d Mon Sep 17 00:00:00 2001
From: Volpeon <git@volpeon.ink>
Date: Sat, 1 Apr 2023 22:13:55 +0200
Subject: Revert

---
 training/strategy/ti.py | 100 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 81 insertions(+), 19 deletions(-)

(limited to 'training/strategy/ti.py')

diff --git a/training/strategy/ti.py b/training/strategy/ti.py
index 1b5adab..677f5a3 100644
--- a/training/strategy/ti.py
+++ b/training/strategy/ti.py
@@ -1,6 +1,6 @@
 from typing import Optional
 from functools import partial
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from pathlib import Path
 
 import torch
@@ -13,6 +13,7 @@ from diffusers import AutoencoderKL, UNet2DConditionModel, DPMSolverMultistepSch
 from slugify import slugify
 
 from models.clip.tokenizer import MultiCLIPTokenizer
+from training.util import EMAModel
 from training.functional import TrainingStrategy, TrainingCallbacks, save_samples
 
 
@@ -31,6 +32,13 @@ def textual_inversion_strategy_callbacks(
     placeholder_tokens: list[str],
     placeholder_token_ids: list[list[int]],
     gradient_checkpointing: bool = False,
+    use_emb_decay: bool = False,
+    emb_decay_target: float = 0.4,
+    emb_decay: float = 1e-2,
+    use_ema: bool = False,
+    ema_inv_gamma: float = 1.0,
+    ema_power: int = 1,
+    ema_max_decay: float = 0.9999,
     sample_batch_size: int = 1,
     sample_num_batches: int = 1,
     sample_num_steps: int = 20,
@@ -63,8 +71,27 @@ def textual_inversion_strategy_callbacks(
         image_size=sample_image_size,
     )
 
+    if use_ema:
+        ema_embeddings = EMAModel(
+            text_encoder.text_model.embeddings.temp_token_embedding.parameters(),
+            inv_gamma=ema_inv_gamma,
+            power=ema_power,
+            max_value=ema_max_decay,
+        )
+        ema_embeddings.to(accelerator.device)
+    else:
+        ema_embeddings = None
+
+    def ema_context():
+        """Return ema_embeddings.apply_temporary(...) for the temp token embedding, or nullcontext() if EMA is off."""
+        if ema_embeddings is None:
+            return nullcontext()
+        return ema_embeddings.apply_temporary(
+            text_encoder.text_model.embeddings.temp_token_embedding.parameters()
+        )
+
     def on_accum_model():
-        return text_encoder.text_model.embeddings
+        return text_encoder.text_model.embeddings.temp_token_embedding
 
     @contextmanager
     def on_train(epoch: int):
@@ -74,36 +101,68 @@ def textual_inversion_strategy_callbacks(
     @contextmanager
     def on_eval():
         tokenizer.eval()
-        yield
+
+        with ema_context():
+            yield
+
+    @torch.no_grad()
+    def on_before_optimize(lr: float, epoch: int):
+        """Return a per-row bool mask of embeddings whose gradient is all zero; None when emb decay is off."""
+        if use_emb_decay:
+            return torch.all(text_encoder.text_model.embeddings.temp_token_embedding.weight.grad == 0, dim=1)
+
+    @torch.no_grad()
+    def on_after_optimize(zero_ids, lr: float):
+        """Step the EMA (if enabled), then nudge the norms of non-skipped embedding rows toward emb_decay_target."""
+        if ema_embeddings is not None:
+            ema_embeddings.step(text_encoder.text_model.embeddings.temp_token_embedding.parameters())
+
+        if use_emb_decay:
+            lambda_ = emb_decay * lr
+            if lambda_ != 0:
+                w = text_encoder.text_model.embeddings.temp_token_embedding.weight
+                # Allocate the mask on w's device so indexing it with zero_ids also works off-CPU.
+                mask = torch.ones(w.shape[0], dtype=torch.bool, device=w.device)
+                mask[zero_ids] = False  # rows flagged by on_before_optimize are left untouched
+                norm = w[mask, :].norm(dim=-1, keepdim=True)
+                # w[mask] is a copy; a bare add_() on it would be discarded, so write the result back.
+                w[mask] += (w[mask] / norm.clamp_min(1e-12)) * lambda_ * (emb_decay_target - norm)
+
+    def on_log():
+        """Return extra log values: {"ema_decay": ...} when EMA is enabled, otherwise an empty dict."""
+        ema_on = ema_embeddings is not None
+        return {"ema_decay": ema_embeddings.decay} if ema_on else {}
 
     @torch.no_grad()
     def on_checkpoint(step, postfix):
         print(f"Saving checkpoint for step {step}...")
 
-        for (token, ids) in zip(placeholder_tokens, placeholder_token_ids):
-            text_encoder.text_model.embeddings.save_embed(
-                ids,
-                checkpoint_output_dir / f"{slugify(token)}_{step}_{postfix}.bin"
-            )
+        with ema_context():
+            for (token, ids) in zip(placeholder_tokens, placeholder_token_ids):
+                text_encoder.text_model.embeddings.save_embed(
+                    ids,
+                    checkpoint_output_dir / f"{slugify(token)}_{step}_{postfix}.bin"
+                )
 
     @torch.no_grad()
     def on_sample(step):
-        unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
-        text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
+        with ema_context():
+            unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
+            text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
 
-        orig_unet_dtype = unet_.dtype
-        orig_text_encoder_dtype = text_encoder_.dtype
+            orig_unet_dtype = unet_.dtype
+            orig_text_encoder_dtype = text_encoder_.dtype
 
-        unet_.to(dtype=weight_dtype)
-        text_encoder_.to(dtype=weight_dtype)
+            unet_.to(dtype=weight_dtype)
+            text_encoder_.to(dtype=weight_dtype)
 
-        save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
+            save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
 
-        unet_.to(dtype=orig_unet_dtype)
-        text_encoder_.to(dtype=orig_text_encoder_dtype)
+            unet_.to(dtype=orig_unet_dtype)
+            text_encoder_.to(dtype=orig_text_encoder_dtype)
 
-        del unet_
-        del text_encoder_
+            del unet_
+            del text_encoder_
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
@@ -112,6 +171,9 @@ def textual_inversion_strategy_callbacks(
         on_accum_model=on_accum_model,
         on_train=on_train,
         on_eval=on_eval,
+        on_before_optimize=on_before_optimize,
+        on_after_optimize=on_after_optimize,
+        on_log=on_log,
         on_checkpoint=on_checkpoint,
         on_sample=on_sample,
     )
-- 
cgit v1.2.3-70-g09d2