-rw-r--r--   dreambooth.py                                        | 14
-rw-r--r--   environment.yaml                                     |  5
-rw-r--r--   infer.py                                             |  2
-rw-r--r--   pipelines/stable_diffusion/vlpn_stable_diffusion.py  | 36
-rw-r--r--   textual_inversion.py                                 | 14
-rw-r--r--   training/optimization.py                             | 42
6 files changed, 102 insertions, 11 deletions
diff --git a/dreambooth.py b/dreambooth.py
index c0caf03..8c4bf50 100644
--- a/dreambooth.py
+++ b/dreambooth.py
@@ -26,6 +26,7 @@ from slugify import slugify
 from schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
 from pipelines.stable_diffusion.vlpn_stable_diffusion import VlpnStableDiffusion
 from data.csv import CSVDataModule
+from training.optimization import get_one_cycle_schedule
 from models.clip.prompt import PromptProcessor
 
 logger = get_logger(__name__)
@@ -178,10 +179,10 @@ def parse_args():
     parser.add_argument(
         "--lr_scheduler",
         type=str,
-        default="cosine_with_restarts",
+        default="one_cycle",
         help=(
             'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
+            ' "constant", "constant_with_warmup", "one_cycle"]'
         ),
     )
     parser.add_argument(
@@ -585,6 +586,8 @@ def main():
         device=accelerator.device
     )
 
+    unet.set_use_memory_efficient_attention_xformers(True)
+
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
         text_encoder.gradient_checkpointing_enable()
@@ -784,7 +787,12 @@ def main():
         overrode_max_train_steps = True
     num_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
-    if args.lr_scheduler == "cosine_with_restarts":
+    if args.lr_scheduler == "one_cycle":
+        lr_scheduler = get_one_cycle_schedule(
+            optimizer=optimizer,
+            num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+        )
+    elif args.lr_scheduler == "cosine_with_restarts":
         lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
             optimizer=optimizer,
             num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
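Note: the viewer cuts the last hunk off mid-call. For orientation, a hedged sketch of the resulting scheduler dispatch follows; the wrapper function, the import locations, and the generic get_scheduler fallback are assumptions rather than lines from this commit.

# Sketch only: the fallback branch and this wrapper are assumptions, not shown in the diff.
from diffusers.optimization import get_scheduler, get_cosine_with_hard_restarts_schedule_with_warmup

from training.optimization import get_one_cycle_schedule


def create_lr_scheduler(args, optimizer):
    steps = args.max_train_steps * args.gradient_accumulation_steps
    warmup = args.lr_warmup_steps * args.gradient_accumulation_steps

    if args.lr_scheduler == "one_cycle":
        # new branch added by this commit; note it does not take --lr_warmup_steps,
        # since the schedule has its own built-in ramp-up phase
        return get_one_cycle_schedule(optimizer=optimizer, num_training_steps=steps)
    if args.lr_scheduler == "cosine_with_restarts":
        return get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=warmup, num_training_steps=steps
        )
    # remaining choices ("linear", "cosine", "polynomial", "constant", ...) go through the generic helper
    return get_scheduler(args.lr_scheduler, optimizer=optimizer, num_warmup_steps=warmup, num_training_steps=steps)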
diff --git a/environment.yaml b/environment.yaml
index de35645..7aa5312 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -1,6 +1,7 @@
 name: ldd
 channels:
   - pytorch
+  - xformers/label/dev
   - defaults
 dependencies:
   - cudatoolkit=11.3
@@ -10,13 +11,14 @@ dependencies:
   - pytorch=1.12.1
   - torchvision=0.13.1
   - pandas=1.4.3
+  - xformers=0.0.14.dev315
   - pip:
     - -e .
     - -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
     - -e git+https://github.com/openai/CLIP.git@main#egg=clip
    - -e git+https://github.com/hlky/k-diffusion-sd#egg=k_diffusion
     - -e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion
-    - -e git+https://github.com/ShivamShrirao/diffusers#egg=diffusers
+    - -e git+https://github.com/huggingface/diffusers#egg=diffusers
     - accelerate==0.12.0
     - albumentations==1.1.0
     - bitsandbytes==0.34.0
@@ -34,4 +36,3 @@ dependencies:
     - torchmetrics==0.9.3
     - transformers==4.23.1
     - triton==2.0.0.dev20220924
-    - xformers==0.0.13
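The xformers dependency moves from a pinned pip wheel (0.0.13) to a dev build pulled from the xformers/label/dev conda channel, and the editable diffusers install now tracks huggingface/diffusers instead of the ShivamShrirao fork. A minimal post-install sanity check (a sketch, not part of the diff):

# Sketch: confirm the conda-provided xformers build imports and that a GPU is visible.
import torch
import xformers
import xformers.ops  # the memory-efficient attention kernels used by diffusers live here

print(xformers.__version__)       # expected: 0.0.14.dev315, per environment.yaml
print(torch.cuda.is_available())  # the fused attention kernels need a CUDA device at runtime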
diff --git a/infer.py b/infer.py
--- a/infer.py
+++ b/infer.py
@@ -234,7 +234,7 @@ def create_pipeline(model, scheduler, ti_embeddings_dir, dtype):
         tokenizer=tokenizer,
         scheduler=scheduler,
     )
-    pipeline.aesthetic_gradient_iters = 20
+    pipeline.enable_xformers_memory_efficient_attention()
     pipeline.to("cuda")
 
     print("Pipeline loaded.")
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index cd5ae7e..36942f0 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -7,6 +7,7 @@ import torch
 import PIL
 
 from diffusers.configuration_utils import FrozenDict
+from diffusers.utils import is_accelerate_available
 from diffusers import AutoencoderKL, DiffusionPipeline, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import logging
@@ -61,13 +62,27 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler=scheduler,
         )
 
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
-
         When this option is enabled, the attention module will split the input tensor in slices, to compute attention
         in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                 When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
@@ -88,6 +103,23 @@ class VlpnStableDiffusion(DiffusionPipeline):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
     @torch.no_grad()
     def __call__(
         self,
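The pipeline now mirrors the memory-saving toggles of the upstream StableDiffusionPipeline: xformers attention, attention slicing, and sequential CPU offload. A usage sketch, assuming `pipeline` is a VlpnStableDiffusion assembled the way infer.py's create_pipeline does it:

# Usage sketch; `pipeline` is assumed to be an already-constructed VlpnStableDiffusion.
pipeline.enable_xformers_memory_efficient_attention()  # route UNet attention through the xformers kernels

# Pick one placement strategy:
pipeline.to("cuda")                         # keep every submodule resident on the GPU (fastest)
# pipeline.enable_sequential_cpu_offload()  # or park submodules on the CPU until their forward() runs (needs accelerate)

As the new docstring notes, when xformers attention and attention slicing are both enabled, the xformers path takes precedence.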
diff --git a/textual_inversion.py b/textual_inversion.py
index 115f3aa..578c054 100644
--- a/textual_inversion.py
+++ b/textual_inversion.py
@@ -25,6 +25,7 @@ from slugify import slugify
 from schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
 from pipelines.stable_diffusion.vlpn_stable_diffusion import VlpnStableDiffusion
 from data.csv import CSVDataModule
+from training.optimization import get_one_cycle_schedule
 from models.clip.prompt import PromptProcessor
 
 logger = get_logger(__name__)
@@ -162,10 +163,10 @@ def parse_args():
     parser.add_argument(
         "--lr_scheduler",
         type=str,
-        default="cosine_with_restarts",
+        default="one_cycle",
         help=(
             'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
-            ' "constant", "constant_with_warmup"]'
+            ' "constant", "constant_with_warmup", "one_cycle"]'
        ),
     )
     parser.add_argument(
@@ -535,6 +536,8 @@ def main():
 
     prompt_processor = PromptProcessor(tokenizer, text_encoder)
 
+    unet.set_use_memory_efficient_attention_xformers(True)
+
     if args.gradient_checkpointing:
         text_encoder.gradient_checkpointing_enable()
 
@@ -693,7 +696,12 @@ def main():
         overrode_max_train_steps = True
     num_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
 
-    if args.lr_scheduler == "cosine_with_restarts":
+    if args.lr_scheduler == "one_cycle":
+        lr_scheduler = get_one_cycle_schedule(
+            optimizer=optimizer,
+            num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+        )
+    elif args.lr_scheduler == "cosine_with_restarts":
         lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
             optimizer=optimizer,
             num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
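textual_inversion.py receives the same treatment as dreambooth.py: the one_cycle default, the xformers attention switch, and the new scheduler branch. A hedged launch sketch; apart from --lr_scheduler, every argument here is a placeholder assumption about the script's CLI, not taken from the diff:

# Launch sketch; the step count is an arbitrary placeholder.
import subprocess

subprocess.run([
    "accelerate", "launch", "textual_inversion.py",
    "--lr_scheduler", "one_cycle",   # now the default, spelled out here for clarity
    "--max_train_steps", "3000",
], check=True)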
diff --git a/training/optimization.py b/training/optimization.py
new file mode 100644
index 0000000..012beed
--- /dev/null
+++ b/training/optimization.py
@@ -0,0 +1,42 @@
+import math
+from torch.optim.lr_scheduler import LambdaLR
+
+from diffusers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+def get_one_cycle_schedule(optimizer, num_training_steps, annealing="cos", min_lr=0.05, mid_point=0.42, last_epoch=-1):
+    """
+    Create a one-cycle schedule: the learning rate rises linearly from `min_lr` times the initial lr to the initial lr
+    over the first `mid_point` fraction of training, then anneals back down to 0 (cosine or linear, per `annealing`).
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_training_steps (`int`):
+            The total number of training steps.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step: int):
+        thresh_up = int(num_training_steps * min(mid_point, 0.5))
+
+        if current_step < thresh_up:
+            return min_lr + float(current_step) / float(max(1, thresh_up)) * (1 - min_lr)
+
+        if annealing == "linear":
+            thresh_down = thresh_up * 2
+
+            if current_step < thresh_down:
+                return min_lr + float(thresh_down - current_step) / float(max(1, thresh_down - thresh_up)) * (1 - min_lr)
+
+            return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - thresh_down))) * min_lr
+        else:
+            progress = float(current_step - thresh_up) / float(max(1, num_training_steps - thresh_up))
+
+            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
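A quick way to see the shape of the new schedule (a sketch; the dummy model, base lr, and step count are arbitrary). With the defaults, the multiplier ramps linearly from 0.05 up to 1.0 at 42% of training, then follows a cosine back down to 0:

# Sketch: inspect the one-cycle multiplier on a throwaway optimizer.
import torch
from training.optimization import get_one_cycle_schedule

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = get_one_cycle_schedule(optimizer, num_training_steps=100)

for step in range(100):
    optimizer.step()
    scheduler.step()
    if step in (0, 20, 41, 70, 99):
        # lr / base_lr prints roughly 0.07 -> 0.53 -> 1.0 -> 0.5 -> 0.0 with the default "cos" annealing
        print(step, round(scheduler.get_last_lr()[0] / 1e-4, 3))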