5 files changed, 113 insertions, 43 deletions
diff --git a/data/csv.py b/data/csv.py
index af36d9e..e901ab4 100644
--- a/data/csv.py
+++ b/data/csv.py
@@ -59,7 +59,7 @@ class CSVDataModule():
        center_crop: bool = False,
        template_key: str = "template",
        valid_set_size: Optional[int] = None,
-        generator: Optional[torch.Generator] = None,
+        seed: Optional[int] = None,
        filter: Optional[Callable[[CSVDataItem], bool]] = None,
        collate_fn=None,
        num_workers: int = 0
@@ -84,7 +84,7 @@ class CSVDataModule():
        self.template_key = template_key
        self.interpolation = interpolation
        self.valid_set_size = valid_set_size
-        self.generator = generator
+        self.seed = seed
        self.filter = filter
        self.collate_fn = collate_fn
        self.num_workers = num_workers
@@ -155,7 +155,11 @@ class CSVDataModule():
        valid_set_size = max(valid_set_size, 1)
        train_set_size = num_images - valid_set_size
-        data_train, data_val = random_split(items, [train_set_size, valid_set_size], self.generator)
+        generator = torch.Generator(device="cpu")
+        if self.seed is not None:
+            generator = generator.manual_seed(self.seed)
+        data_train, data_val = random_split(items, [train_set_size, valid_set_size], generator=generator)
        self.data_train = self.pad_items(data_train, self.num_class_images)
        self.data_val = self.pad_items(data_val)
diff --git a/train_dreambooth.py b/train_dreambooth.py
index df8b54c..6d9bae8 100644
--- a/train_dreambooth.py
+++ b/train_dreambooth.py
@@ -320,6 +320,12 @@ def parse_args():
        help="Epsilon value for the Adam optimizer"
    )
    parser.add_argument(
+        "--adam_amsgrad",
+        # A flag, not type=bool: bool() of any non-empty string (even "False") is True.
+        action="store_true",
+        help="Use the AMSGrad variant of the Adam optimizer"
+    )
+    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
@@ -642,7 +648,7 @@ def main():
        )
    if args.find_lr:
-        args.learning_rate = 1e-4
+        args.learning_rate = 1e-6
    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
@@ -674,6 +680,7 @@ def main():
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
+        amsgrad=args.adam_amsgrad,
    )
    weight_dtype = torch.float32
@@ -730,6 +737,7 @@ def main():
        template_key=args.train_data_template,
        valid_set_size=args.valid_set_size,
        num_workers=args.dataloader_num_workers,
+        seed=args.seed,
        filter=keyword_filter,
        collate_fn=collate_fn
    )
@@ -840,7 +848,7 @@ def main():
    def on_eval():
        tokenizer.eval()
-    def loop(batch):
+    def loop(batch, eval: bool = False):
        # Convert images to latent space
        latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
        latents = latents * 0.18215
@@ -849,8 +857,14 @@ def main():
        noise = torch.randn_like(latents)
        bsz = latents.shape[0]
        # Sample a random timestep for each image
-        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps,
+        timesteps_gen = None
-                                  (bsz,), device=latents.device)
+        # Eval reseeds on every call so timesteps are repeatable; with no seed, use the global RNG.
+        if eval and args.seed is not None:
+            timesteps_gen = torch.Generator(device=latents.device).manual_seed(args.seed)
+        timesteps = torch.randint(
+            0, noise_scheduler.config.num_train_timesteps, (bsz,),
+            generator=timesteps_gen, device=latents.device,
+        )
        timesteps = timesteps.long()
        # Add noise to the latents according to the noise magnitude at each timestep
@@ -1051,7 +1065,7 @@ def main():
            with torch.inference_mode():
                for step, batch in enumerate(val_dataloader):
-                    loss, acc, bsz = loop(batch)
+                    loss, acc, bsz = loop(batch, True)
                    loss = loss.detach_()
                    acc = acc.detach_()
diff --git a/train_ti.py b/train_ti.py
index 1685dc4..5d6eafc 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -289,6 +289,12 @@ def parse_args():
        help="Epsilon value for the Adam optimizer"
    )
    parser.add_argument(
+        "--adam_amsgrad",
+        # A flag, not type=bool: bool() of any non-empty string (even "False") is True.
+        action="store_true",
+        help="Use the AMSGrad variant of the Adam optimizer"
+    )
+    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="no",
@@ -592,7 +598,7 @@ def main():
        )
    if args.find_lr:
-        args.learning_rate = 1e-4
+        args.learning_rate = 1e-6
    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
@@ -612,6 +618,7 @@ def main():
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
+        amsgrad=args.adam_amsgrad,
    )
    weight_dtype = torch.float32
@@ -673,6 +680,7 @@ def main():
        template_key=args.train_data_template,
        valid_set_size=args.valid_set_size,
        num_workers=args.dataloader_num_workers,
+        seed=args.seed,
        filter=keyword_filter,
        collate_fn=collate_fn
    )
@@ -791,7 +799,7 @@ def main():
    def on_eval():
        tokenizer.eval()
-    def loop(batch):
+    def loop(batch, eval: bool = False):
        # Convert images to latent space
        latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach()
        latents = latents * 0.18215
@@ -800,8 +808,14 @@ def main():
        noise = torch.randn_like(latents)
        bsz = latents.shape[0]
        # Sample a random timestep for each image
-        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps,
+        timesteps_gen = None
-                                  (bsz,), device=latents.device)
+        # Eval reseeds on every call so timesteps are repeatable; with no seed, use the global RNG.
+        if eval and args.seed is not None:
+            timesteps_gen = torch.Generator(device=latents.device).manual_seed(args.seed)
+        timesteps = torch.randint(
+            0, noise_scheduler.config.num_train_timesteps, (bsz,),
+            generator=timesteps_gen, device=latents.device,
+        )
        timesteps = timesteps.long()
        # Add noise to the latents according to the noise magnitude at each timestep
@@ -984,7 +998,7 @@ def main():
            with torch.inference_mode():
                for step, batch in enumerate(val_dataloader):
-                    loss, acc, bsz = loop(batch)
+                    loss, acc, bsz = loop(batch, True)
                    loss = loss.detach_()
                    acc = acc.detach_()
diff --git a/training/lora.py b/training/lora.py
index e1c0971..3857d78 100644
--- a/training/lora.py
+++ b/training/lora.py
@@ -1,3 +1,4 @@
+import torch
 import torch.nn as nn
 from diffusers import ModelMixin, ConfigMixin
@@ -13,56 +14,99 @@ else:
    xformers = None
-class LoraAttnProcessor(ModelMixin, ConfigMixin):
+class LoRALinearLayer(nn.Module):
+    """Low-rank linear adapter: a down projection to `rank` features followed by an up projection."""
-    @register_to_config
+    def __init__(self, in_features, out_features, rank=4):
-    def __init__(
-        self,
-        cross_attention_dim,
-        inner_dim,
-        r: int = 4
-    ):
        super().__init__()
-        if r > min(cross_attention_dim, inner_dim):
+        if rank > min(in_features, out_features):
            raise ValueError(
-                f"LoRA rank {r} must be less or equal than {min(cross_attention_dim, inner_dim)}"
+                f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}"
            )
-        self.lora_k_down = nn.Linear(cross_attention_dim, r, bias=False)
+        self.lora_down = nn.Linear(in_features, rank, bias=False)
-        self.lora_k_up = nn.Linear(r, inner_dim, bias=False)
+        self.lora_up = nn.Linear(rank, out_features, bias=False)
+        self.scale = 1.0
-        self.lora_v_down = nn.Linear(cross_attention_dim, r, bias=False)
+        nn.init.normal_(self.lora_down.weight, std=1 / rank)
-        self.lora_v_up = nn.Linear(r, inner_dim, bias=False)
+        # The up projection starts at zero, so a freshly created layer outputs zeros.
+        nn.init.zeros_(self.lora_up.weight)
-        self.scale = 1.0
+    def forward(self, hidden_states):
+        down_hidden_states = self.lora_down(hidden_states)
+        up_hidden_states = self.lora_up(down_hidden_states)
-        nn.init.normal_(self.lora_k_down.weight, std=1 / r**2)
+        return up_hidden_states
-        nn.init.zeros_(self.lora_k_up.weight)
-        nn.init.normal_(self.lora_v_down.weight, std=1 / r**2)
-        nn.init.zeros_(self.lora_v_up.weight)
-    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+class LoRACrossAttnProcessor(nn.Module):
+    """Attention processor that adds a LoRALinearLayer of the given rank to the q, k, v and out projections."""
-        batch_size, sequence_length, _ = hidden_states.shape
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
+        super().__init__()
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
+        batch_size, sequence_length, _ = hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
-        query = attn.to_q(hidden_states)
+        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+        query = attn.head_to_batch_dim(query)
        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-        key = attn.to_k(encoder_hidden_states) + self.lora_k_up(self.lora_k_down(encoder_hidden_states)) * self.scale
-        value = attn.to_v(encoder_hidden_states) + self.lora_v_up(self.lora_v_down(encoder_hidden_states)) * self.scale
+        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+class LoRAXFormersCrossAttnProcessor(nn.Module):
+    """Variant of LoRACrossAttnProcessor that computes attention with xformers.ops.memory_efficient_attention."""
+    def __init__(self, hidden_size, cross_attention_dim=None, rank=4):
+        super().__init__()
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank)
+    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
+        query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
        query = attn.head_to_batch_dim(query).contiguous()
+        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()
        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
-        hidden_states = hidden_states.to(query.dtype)
+        # Same reshape LoRACrossAttnProcessor applies before the output projection.
        hidden_states = attn.batch_to_head_dim(hidden_states)
        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
diff --git a/training/lr.py b/training/lr.py
index 37588b6..a3144ba 100644
--- a/training/lr.py
+++ b/training/lr.py
@@ -1,6 +1,6 @@
 import math
 import copy
-from typing import Callable
+from typing import Callable, Any, Tuple, Union
 from functools import partial
 import matplotlib.pyplot as plt
@@ -24,7 +24,7 @@ class LRFinder():
        optimizer,
        train_dataloader,
        val_dataloader,
-        loss_fn,
+        loss_fn: Union[Callable[[Any], Tuple[Any, Any, int]], Callable[[Any, bool], Tuple[Any, Any, int]]],
        on_train: Callable[[], None] = noop,
        on_eval: Callable[[], None] = noop
    ):
@@ -108,7 +108,7 @@ class LRFinder():
                    if step >= num_val_batches:
                        break
-                    loss, acc, bsz = self.loss_fn(batch)
+                    loss, acc, bsz = self.loss_fn(batch, True)
                    avg_loss.update(loss.detach_(), bsz)
                    avg_acc.update(acc.detach_(), bsz)