|  |  |  |
|---|---|---|
| author | Volpeon <git@volpeon.ink> | 2023-02-21 11:50:11 +0100 |
| committer | Volpeon <git@volpeon.ink> | 2023-02-21 11:50:11 +0100 |
| commit | 9d6252e63bac241e5c6191eb47adb51b84a5d782 (patch) | |
| tree | 6cb649510b48ca33419af3721e630f1c06bf1ae2 | |
| parent | Embedding normalization: Ignore tensors with grad = 0 (diff) | |
| download | textual-inversion-diff-9d6252e63bac241e5c6191eb47adb51b84a5d782.tar.gz textual-inversion-diff-9d6252e63bac241e5c6191eb47adb51b84a5d782.tar.bz2 textual-inversion-diff-9d6252e63bac241e5c6191eb47adb51b84a5d782.zip | |
Don't rely on Accelerate for gradient accumulation
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | train_dreambooth.py | 2 |
| -rw-r--r-- | train_lora.py | 2 |
| -rw-r--r-- | train_ti.py | 2 |
| -rw-r--r-- | training/functional.py | 53 |
| -rw-r--r-- | training/strategy/dreambooth.py | 6 |

5 files changed, 32 insertions, 33 deletions
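The commit removes `gradient_accumulation_steps` from the `Accelerator(...)` constructor in the three entry scripts and instead threads it through `train()` into `train_loop()`, which now accumulates gradients itself: it scales each micro-batch loss, counts micro-steps, and only runs the optimizer every `gradient_accumulation_steps` steps. A minimal, self-contained sketch of that pattern (the toy model, optimizer, and data below are hypothetical stand-ins, not the project's code):

```python
# Sketch of manual gradient accumulation, assuming a generic PyTorch setup.
import torch

torch.manual_seed(0)
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda _: 1.0)
data = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(12)]

gradient_accumulation_steps = 4
train_step = 0
global_step = 0

for inputs, targets in data:
    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    # Scale so the gradients summed over one window match the mean-loss gradient.
    loss = loss / gradient_accumulation_steps
    loss.backward()

    train_step += 1
    if train_step % gradient_accumulation_steps == 0:
        # Only every `gradient_accumulation_steps` micro-batches: optimizer step,
        # scheduler step, and gradient reset.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad(set_to_none=True)
        global_step += 1

print(global_step)  # 3 optimizer steps for 12 micro-batches
```

With 12 micro-batches and an accumulation window of 4, the sketch performs 3 optimizer steps, mirroring `num_training_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)` in `training/functional.py`.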
```diff
diff --git a/train_dreambooth.py b/train_dreambooth.py
index 431ff3d..280cf77 100644
--- a/train_dreambooth.py
+++ b/train_dreambooth.py
@@ -439,7 +439,6 @@ def main():
     accelerator = Accelerator(
         log_with=LoggerType.TENSORBOARD,
         logging_dir=f"{output_dir}",
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision
     )
 
@@ -590,6 +589,7 @@ def main():
         lr_scheduler=lr_scheduler,
         prepare_unet=True,
         num_train_epochs=args.num_train_epochs,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
         sample_frequency=args.sample_frequency,
         # --
         tokenizer=tokenizer,
```
```diff
diff --git a/train_lora.py b/train_lora.py
index a06591d..d7c2de0 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -399,7 +399,6 @@ def main():
     accelerator = Accelerator(
         log_with=LoggerType.TENSORBOARD,
         logging_dir=f"{output_dir}",
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision
     )
 
@@ -561,6 +560,7 @@ def main():
         optimizer=optimizer,
         lr_scheduler=lr_scheduler,
         num_train_epochs=args.num_train_epochs,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
         sample_frequency=args.sample_frequency,
         # --
         tokenizer=tokenizer,
```
```diff
diff --git a/train_ti.py b/train_ti.py
index 6dc07dd..68783ea 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -518,7 +518,6 @@ def main():
     accelerator = Accelerator(
         log_with=LoggerType.TENSORBOARD,
         logging_dir=f"{output_dir}",
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision
     )
 
@@ -611,6 +610,7 @@ def main():
         low_freq_noise=0,
         strategy=textual_inversion_strategy,
         num_train_epochs=args.num_train_epochs,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
         sample_frequency=args.sample_frequency,
         checkpoint_frequency=args.checkpoint_frequency,
         milestone_checkpoints=not args.no_milestone_checkpoints,
```
```diff
diff --git a/training/functional.py b/training/functional.py
index 739d055..3f5fa7e 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -365,15 +365,17 @@ def train_loop(
     milestone_checkpoints: bool = True,
     global_step_offset: int = 0,
     num_epochs: int = 100,
+    gradient_accumulation_steps: int = 1,
     callbacks: TrainingCallbacks = TrainingCallbacks(),
 ):
-    num_training_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
+    num_training_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
     num_val_steps_per_epoch = len(val_dataloader) if val_dataloader is not None else 0
 
     num_training_steps = num_training_steps_per_epoch * num_epochs
     num_val_steps = num_val_steps_per_epoch * num_epochs
 
     global_step = 0
+    train_step = 0
 
     avg_loss = AverageMeter()
     avg_acc = AverageMeter()
@@ -434,44 +436,45 @@ def train_loop(
 
         with on_train(epoch):
             for step, batch in enumerate(train_dataloader):
-                with accelerator.accumulate(model):
-                    loss, acc, bsz = loss_step(step, batch)
+                loss, acc, bsz = loss_step(step, batch)
+                loss /= gradient_accumulation_steps
 
-                    accelerator.backward(loss)
+                avg_loss.update(loss.detach_(), bsz)
+                avg_acc.update(acc.detach_(), bsz)
 
+                accelerator.backward(loss)
+
+                logs = {
+                    "train/loss": avg_loss.avg.item(),
+                    "train/acc": avg_acc.avg.item(),
+                    "train/cur_loss": loss.item(),
+                    "train/cur_acc": acc.item(),
+                    "lr": lr_scheduler.get_last_lr()[0],
+                }
+                logs.update(on_log())
+
+                local_progress_bar.set_postfix(**logs)
+
+                train_step += 1
+
+                if train_step % gradient_accumulation_steps == 0:
                     on_before_optimize(lr_scheduler.get_last_lr()[0], epoch)
 
                     optimizer.step()
                     lr_scheduler.step()
                     optimizer.zero_grad(set_to_none=True)
 
-                    avg_loss.update(loss.detach_(), bsz)
-                    avg_acc.update(acc.detach_(), bsz)
-
-                # Checks if the accelerator has performed an optimization step behind the scenes
-                if accelerator.sync_gradients:
                     on_after_optimize(lr_scheduler.get_last_lr()[0])
 
                     local_progress_bar.update(1)
                     global_progress_bar.update(1)
 
-                    global_step += 1
+                    accelerator.log(logs, step=global_step)
 
-                    logs = {
-                        "train/loss": avg_loss.avg.item(),
-                        "train/acc": avg_acc.avg.item(),
-                        "train/cur_loss": loss.item(),
-                        "train/cur_acc": acc.item(),
-                        "lr": lr_scheduler.get_last_lr()[0],
-                    }
-                    logs.update(on_log())
-
-                    accelerator.log(logs, step=global_step)
-
-                    local_progress_bar.set_postfix(**logs)
+                    global_step += 1
 
                 if global_step >= num_training_steps:
                     break
 
         accelerator.wait_for_everyone()
 
@@ -571,6 +574,7 @@ def train(
     strategy: TrainingStrategy,
     no_val: bool = False,
     num_train_epochs: int = 100,
+    gradient_accumulation_steps: int = 1,
     sample_frequency: int = 20,
     checkpoint_frequency: int = 50,
     milestone_checkpoints: bool = True,
@@ -631,6 +635,7 @@ def train(
         milestone_checkpoints=milestone_checkpoints,
         global_step_offset=global_step_offset,
         num_epochs=num_train_epochs,
+        gradient_accumulation_steps=gradient_accumulation_steps,
         callbacks=callbacks,
     )
 
```
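Since Accelerate no longer averages anything behind the scenes, `train_loop` now divides each micro-batch loss by `gradient_accumulation_steps` before calling `accelerator.backward(loss)`; the gradients accumulated over one window then equal the gradient of the mean loss over that window. A small sanity check of that equivalence (toy linear model, for illustration only):

```python
# Verify: accumulating backward(loss / G) over G micro-batches gives the same
# gradient as one backward pass on the mean of the G losses.
import torch

torch.manual_seed(0)
G = 4  # accumulation window, mirrors gradient_accumulation_steps
xs = [torch.randn(2, 3) for _ in range(G)]
ys = [torch.randn(2, 1) for _ in range(G)]

model_a = torch.nn.Linear(3, 1)
model_b = torch.nn.Linear(3, 1)
model_b.load_state_dict(model_a.state_dict())  # identical weights

# (a) manual accumulation with the 1/G scaling
for x, y in zip(xs, ys):
    (torch.nn.functional.mse_loss(model_a(x), y) / G).backward()

# (b) single backward pass on the mean of the per-micro-batch losses
torch.stack([
    torch.nn.functional.mse_loss(model_b(x), y) for x, y in zip(xs, ys)
]).mean().backward()

print(torch.allclose(model_a.weight.grad, model_b.weight.grad))  # True
print(torch.allclose(model_a.bias.grad, model_b.bias.grad))      # True
```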
```diff
diff --git a/training/strategy/dreambooth.py b/training/strategy/dreambooth.py
index d697554..fcf5c0d 100644
--- a/training/strategy/dreambooth.py
+++ b/training/strategy/dreambooth.py
@@ -41,12 +41,6 @@ def dreambooth_strategy_callbacks(
     sample_guidance_scale: float = 7.5,
     sample_image_size: Optional[int] = None,
 ):
-    if accelerator.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
-        raise ValueError(
-            "Gradient accumulation is not supported when training the text encoder in distributed training. "
-            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
-        )
-
     sample_output_dir.mkdir(parents=True, exist_ok=True)
     checkpoint_output_dir.mkdir(parents=True, exist_ok=True)
 
```
