From 7b04d813739c0b5595295dffdc86cc41108db2d3 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Sat, 6 May 2023 16:25:36 +0200
Subject: Update

---
 train_lora.py             |  17 +++++
 train_ti.py               |  17 +++++
 training/functional.py    |  20 +++---
 training/sampler.py       | 154 ++++++++++++++++++++++++++++++++++++++++++++++
 training/strategy/lora.py |  10 +--
 5 files changed, 205 insertions(+), 13 deletions(-)
 create mode 100644 training/sampler.py

diff --git a/train_lora.py b/train_lora.py
index cc7c1ec..70fbae4 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -27,6 +27,7 @@ from data.csv import VlpnDataModule, keyword_filter
 from training.functional import train, add_placeholder_tokens, get_models
 from training.strategy.lora import lora_strategy
 from training.optimization import get_scheduler
+from training.sampler import create_named_schedule_sampler
 from training.util import AverageMeter, save_args
 
 # https://github.com/huggingface/peft/blob/main/examples/lora_dreambooth/train_dreambooth.py
@@ -409,6 +410,19 @@ def parse_args():
         default=0.04,
         help="Minimum learning rate in the lr scheduler."
     )
+    parser.add_argument(
+        "--min_snr_gamma",
+        type=int,
+        default=5,
+        help="MinSNR gamma."
+    )
+    parser.add_argument(
+        "--schedule_sampler",
+        type=str,
+        default="uniform",
+        choices=["uniform", "loss-second-moment"],
+        help="Noise schedule sampler."
+    )
     parser.add_argument(
         "--optimizer",
         type=str,
@@ -708,6 +722,7 @@ def main():
         args.emb_alpha,
         args.emb_dropout
     )
+    schedule_sampler = create_named_schedule_sampler(args.schedule_sampler, noise_scheduler.config.num_train_timesteps)
 
     unet_config = LoraConfig(
         r=args.lora_r,
@@ -923,6 +938,8 @@
         tokenizer=tokenizer,
         vae=vae,
         noise_scheduler=noise_scheduler,
+        schedule_sampler=schedule_sampler,
+        min_snr_gamma=args.min_snr_gamma,
         dtype=weight_dtype,
         seed=args.seed,
         compile_unet=args.compile_unet,
diff --git a/train_ti.py b/train_ti.py
index ae73639..26f7941 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -23,6 +23,7 @@ from data.csv import VlpnDataModule, keyword_filter
 from training.functional import train, add_placeholder_tokens, get_models
 from training.strategy.ti import textual_inversion_strategy
 from training.optimization import get_scheduler
+from training.sampler import create_named_schedule_sampler
 from training.util import AverageMeter, save_args
 
 logger = get_logger(__name__)
@@ -358,6 +359,19 @@
         type=float,
         default=0.9999
     )
+    parser.add_argument(
+        "--min_snr_gamma",
+        type=int,
+        default=5,
+        help="MinSNR gamma."
+    )
+    parser.add_argument(
+        "--schedule_sampler",
+        type=str,
+        default="uniform",
+        choices=["uniform", "loss-second-moment"],
+        help="Noise schedule sampler."
+    )
     parser.add_argument(
         "--optimizer",
         type=str,
@@ -682,6 +696,7 @@ def main():
         args.emb_alpha,
         args.emb_dropout
     )
+    schedule_sampler = create_named_schedule_sampler(args.schedule_sampler, noise_scheduler.config.num_train_timesteps)
 
     tokenizer.set_use_vector_shuffle(args.vector_shuffle)
     tokenizer.set_dropout(args.vector_dropout)
@@ -837,6 +852,8 @@
         tokenizer=tokenizer,
         vae=vae,
         noise_scheduler=noise_scheduler,
+        schedule_sampler=schedule_sampler,
+        min_snr_gamma=args.min_snr_gamma,
         dtype=weight_dtype,
         seed=args.seed,
         compile_unet=args.compile_unet,
diff --git a/training/functional.py b/training/functional.py
index e7e1eb3..eae5681 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -25,6 +25,7 @@ from models.clip.util import get_extended_embeddings
 from models.clip.tokenizer import MultiCLIPTokenizer
 from models.convnext.discriminator import ConvNeXtDiscriminator
 from training.util import AverageMeter
+from training.sampler import ScheduleSampler, LossAwareSampler, UniformSampler
 from util.slerp import slerp
 
 
@@ -318,6 +319,7 @@ def get_original(
 def loss_step(
     vae: AutoencoderKL,
     noise_scheduler: SchedulerMixin,
+    schedule_sampler: ScheduleSampler,
     unet: UNet2DConditionModel,
     text_encoder: CLIPTextModel,
     guidance_scale: float,
@@ -362,14 +364,7 @@
         new_noise = noise + input_pertubation * torch.randn_like(noise)
 
     # Sample a random timestep for each image
-    timesteps = torch.randint(
-        0,
-        noise_scheduler.config.num_train_timesteps,
-        (bsz,),
-        generator=generator,
-        device=latents.device,
-    )
-    timesteps = timesteps.long()
+    timesteps, weights = schedule_sampler.sample(bsz, latents.device)
 
     # Add noise to the latents according to the noise magnitude at each timestep
     # (this is the forward diffusion process)
@@ -443,6 +438,10 @@
         )
         loss = loss * mse_loss_weights
 
+    if isinstance(schedule_sampler, LossAwareSampler):
+        schedule_sampler.update_with_all_losses(timesteps, loss.detach())
+
+    loss = loss * weights
     loss = loss.mean()
 
     return loss, acc, bsz
@@ -694,6 +693,7 @@ def train(
     offset_noise_strength: float = 0.01,
     input_pertubation: float = 0.1,
     disc: Optional[ConvNeXtDiscriminator] = None,
+    schedule_sampler: Optional[ScheduleSampler] = None,
     min_snr_gamma: int = 5,
     avg_loss: AverageMeter = AverageMeter(),
     avg_acc: AverageMeter = AverageMeter(),
@@ -725,10 +725,14 @@
         **kwargs,
     )
 
+    if schedule_sampler is None:
+        schedule_sampler = UniformSampler(noise_scheduler.config.num_train_timesteps)
+
     loss_step_ = partial(
         loss_step,
         vae,
         noise_scheduler,
+        schedule_sampler,
         unet,
         text_encoder,
         guidance_scale,
diff --git a/training/sampler.py b/training/sampler.py
new file mode 100644
index 0000000..8afe255
--- /dev/null
+++ b/training/sampler.py
@@ -0,0 +1,154 @@
+from abc import ABC, abstractmethod
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+
+def create_named_schedule_sampler(name, num_timesteps):
+    """
+    Create a ScheduleSampler from a library of pre-defined samplers.
+
+    :param name: the name of the sampler.
+    :param num_timesteps: the number of diffusion timesteps to sample over.
+    """
+    if name == "uniform":
+        return UniformSampler(num_timesteps)
+    elif name == "loss-second-moment":
+        return LossSecondMomentResampler(num_timesteps)
+    else:
+        raise NotImplementedError(f"unknown schedule sampler: {name}")
+
+
+class ScheduleSampler(ABC):
+    """
+    A distribution over timesteps in the diffusion process, intended to reduce
+    variance of the objective.
+
+    By default, samplers perform unbiased importance sampling, in which the
+    objective's mean is unchanged.
+    However, subclasses may override sample() to change how the resampled
+    terms are reweighted, allowing for actual changes in the objective.
+    """
+
+    @abstractmethod
+    def weights(self):
+        """
+        Get a numpy array of weights, one per diffusion step.
+
+        The weights needn't be normalized, but must be positive.
+        """
+
+    def sample(self, batch_size, device):
+        """
+        Importance-sample timesteps for a batch.
+
+        :param batch_size: the number of timesteps.
+        :param device: the torch device to save to.
+        :return: a tuple (timesteps, weights):
+                 - timesteps: a tensor of timestep indices.
+                 - weights: a tensor of weights to scale the resulting losses.
+        """
+        w = self.weights()
+        p = w / np.sum(w)
+        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
+        indices = torch.from_numpy(indices_np).long().to(device)
+        weights_np = 1 / (len(p) * p[indices_np])
+        weights = torch.from_numpy(weights_np).float().to(device)
+        return indices, weights
+
+
+class UniformSampler(ScheduleSampler):
+    def __init__(self, num_timesteps):
+        self.num_timesteps = num_timesteps
+        self._weights = np.ones([num_timesteps])
+
+    def weights(self):
+        return self._weights
+
+
+class LossAwareSampler(ScheduleSampler):
+    def update_with_local_losses(self, local_ts, local_losses):
+        """
+        Update the reweighting using losses from a model.
+
+        Call this method from each rank with a batch of timesteps and the
+        corresponding losses for each of those timesteps.
+        This method will perform synchronization to make sure all of the ranks
+        maintain the exact same reweighting.
+
+        :param local_ts: an integer Tensor of timesteps.
+        :param local_losses: a 1D Tensor of losses.
+        """
+        batch_sizes = [
+            torch.tensor([0], dtype=torch.int32, device=local_ts.device)
+            for _ in range(dist.get_world_size())
+        ]
+        dist.all_gather(
+            batch_sizes,
+            torch.tensor([len(local_ts)], dtype=torch.int32, device=local_ts.device),
+        )
+
+        # Pad all_gather batches to be the maximum batch size.
+        batch_sizes = [x.item() for x in batch_sizes]
+        max_bs = max(batch_sizes)
+
+        timestep_batches = [torch.zeros(max_bs).to(local_ts) for bs in batch_sizes]
+        loss_batches = [torch.zeros(max_bs).to(local_losses) for bs in batch_sizes]
+        dist.all_gather(timestep_batches, local_ts)
+        dist.all_gather(loss_batches, local_losses)
+        timesteps = [
+            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
+        ]
+        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
+        self.update_with_all_losses(timesteps, losses)
+
+    @abstractmethod
+    def update_with_all_losses(self, ts, losses):
+        """
+        Update the reweighting using losses from a model.
+
+        Sub-classes should override this method to update the reweighting
+        using losses from the model.
+
+        This method directly updates the reweighting without synchronizing
+        between workers. It is called by update_with_local_losses from all
+        ranks with identical arguments. Thus, it should have deterministic
+        behavior to maintain state across workers.
+
+        :param ts: a list of int timesteps.
+        :param losses: a list of float losses, one per timestep.
+        """
+
+
+class LossSecondMomentResampler(LossAwareSampler):
+    def __init__(self, num_timesteps, history_per_term=10, uniform_prob=0.001):
+        self.num_timesteps = num_timesteps
+        self.history_per_term = history_per_term
+        self.uniform_prob = uniform_prob
+        self._loss_history = np.zeros(
+            [self.num_timesteps, history_per_term], dtype=np.float64
+        )
+        self._loss_counts = np.zeros([self.num_timesteps], dtype=np.int64)
+
+    def weights(self):
+        if not self._warmed_up():
+            return np.ones([self.num_timesteps], dtype=np.float64)
+        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
+        weights /= np.sum(weights)
+        weights *= 1 - self.uniform_prob
+        weights += self.uniform_prob / len(weights)
+        return weights
+
+    def update_with_all_losses(self, ts, losses):
+        for t, loss in zip(ts, losses):
+            if self._loss_counts[t] == self.history_per_term:
+                # Shift out the oldest loss term.
+                self._loss_history[t, :-1] = self._loss_history[t, 1:]
+                self._loss_history[t, -1] = loss
+            else:
+                self._loss_history[t, self._loss_counts[t]] = loss
+                self._loss_counts[t] += 1
+
+    def _warmed_up(self):
+        return (self._loss_counts == self.history_per_term).all()
diff --git a/training/strategy/lora.py b/training/strategy/lora.py
index 3f4dbbc..0c0f633 100644
--- a/training/strategy/lora.py
+++ b/training/strategy/lora.py
@@ -120,11 +120,11 @@
         unet_ = accelerator.unwrap_model(unet, keep_fp32_wrapper=False)
         text_encoder_ = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=False)
 
-        for (token, ids) in zip(placeholder_tokens, placeholder_token_ids):
-            text_encoder_.text_model.embeddings.save_embed(
-                ids,
-                checkpoint_output_dir / f"{slugify(token)}_{step}_{postfix}.bin"
-            )
+        # for (token, ids) in zip(placeholder_tokens, placeholder_token_ids):
+        #     text_encoder_.text_model.embeddings.save_embed(
+        #         ids,
+        #         checkpoint_output_dir / f"{slugify(token)}_{step}_{postfix}.bin"
+        #     )
 
         if not pti_mode:
             lora_config = {}
-- 
cgit v1.2.3-70-g09d2
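
A minimal usage sketch of the sampler this patch introduces in training/sampler.py, mirroring the flow it adds to loss_step in training/functional.py. It assumes training/sampler.py from this commit is importable; the timestep count, batch size, loop length, and the random per-sample loss are illustrative stand-ins for noise_scheduler.config.num_train_timesteps, bsz, and the MSE term computed in loss_step.

    # Sketch of the sample -> compute loss -> update loop, assuming the
    # training.sampler module added by this commit. The random loss below is a
    # stand-in for the per-sample diffusion loss.
    import torch

    from training.sampler import LossAwareSampler, create_named_schedule_sampler

    num_train_timesteps = 1000  # stand-in for noise_scheduler.config.num_train_timesteps
    batch_size = 4
    device = torch.device("cpu")

    schedule_sampler = create_named_schedule_sampler("loss-second-moment", num_train_timesteps)

    for _ in range(3):
        # Importance-sample timesteps; `weights` undoes the sampling bias so the
        # expected training objective is unchanged.
        timesteps, weights = schedule_sampler.sample(batch_size, device)

        # Stand-in for the per-sample diffusion loss at those timesteps.
        loss = torch.rand(batch_size, device=device)

        # Feed the observed losses back so the second-moment history (and hence
        # the sampling distribution) adapts over time.
        if isinstance(schedule_sampler, LossAwareSampler):
            schedule_sampler.update_with_all_losses(timesteps, loss.detach())

        loss = (loss * weights).mean()

With the default "uniform" choice, the sampler draws timesteps uniformly and returns unit weights, so the loop reduces to the torch.randint sampling the patch replaces (apart from no longer using the seeded generator).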