From 6df1fc46daca9c289f1d7f7524e01deac5c92fd1 Mon Sep 17 00:00:00 2001 From: Volpeon Date: Tue, 27 Dec 2022 13:58:48 +0100 Subject: Improved learning rate finder --- train_dreambooth.py | 5 ++--- train_ti.py | 10 +++------- training/lr.py | 38 +++++++++++++++++++++++++------------- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/train_dreambooth.py b/train_dreambooth.py index a62cec9..325fe90 100644 --- a/train_dreambooth.py +++ b/train_dreambooth.py @@ -970,9 +970,8 @@ def main(): avg_loss_val.update(loss.detach_(), bsz) avg_acc_val.update(acc.detach_(), bsz) - if accelerator.sync_gradients: - local_progress_bar.update(1) - global_progress_bar.update(1) + local_progress_bar.update(1) + global_progress_bar.update(1) logs = { "val/loss": avg_loss_val.avg.item(), diff --git a/train_ti.py b/train_ti.py index 32f44f4..870b2ba 100644 --- a/train_ti.py +++ b/train_ti.py @@ -548,9 +548,6 @@ def main(): args.train_batch_size * accelerator.num_processes ) - if args.find_lr: - args.learning_rate = 1e2 - # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs if args.use_8bit_adam: try: @@ -783,7 +780,7 @@ def main(): if args.find_lr: lr_finder = LRFinder(accelerator, text_encoder, optimizer, train_dataloader, val_dataloader, loop) - lr_finder.run(num_train_steps=2) + lr_finder.run(min_lr=1e-6, num_train_batches=4) plt.savefig(basepath.joinpath("lr.png")) plt.close() @@ -908,9 +905,8 @@ def main(): avg_loss_val.update(loss.detach_(), bsz) avg_acc_val.update(acc.detach_(), bsz) - if accelerator.sync_gradients: - local_progress_bar.update(1) - global_progress_bar.update(1) + local_progress_bar.update(1) + global_progress_bar.update(1) logs = { "val/loss": avg_loss_val.avg.item(), diff --git a/training/lr.py b/training/lr.py index 5343f24..8e558e1 100644 --- a/training/lr.py +++ b/training/lr.py @@ -1,3 +1,6 @@ +import math +import copy + import matplotlib.pyplot as plt import numpy as np import torch @@ -16,15 +19,22 @@ class LRFinder(): self.val_dataloader = val_dataloader self.loss_fn = loss_fn - def run(self, num_epochs=100, num_train_steps=1, num_val_steps=1, smooth_f=0.05, diverge_th=5): + self.model_state = copy.deepcopy(model.state_dict()) + self.optimizer_state = copy.deepcopy(optimizer.state_dict()) + + def run(self, min_lr, num_epochs=100, num_train_batches=1, num_val_batches=math.inf, smooth_f=0.05, diverge_th=5): best_loss = None lrs = [] losses = [] - lr_scheduler = get_exponential_schedule(self.optimizer, num_epochs) + lr_scheduler = get_exponential_schedule(self.optimizer, min_lr, num_epochs) + + steps = min(num_train_batches, len(self.train_dataloader)) + steps += min(num_val_batches, len(self.val_dataloader)) + steps *= num_epochs progress_bar = tqdm( - range(num_epochs * (num_train_steps + num_val_steps)), + range(steps), disable=not self.accelerator.is_local_main_process, dynamic_ncols=True ) @@ -38,6 +48,9 @@ class LRFinder(): self.model.train() for step, batch in enumerate(self.train_dataloader): + if step >= num_train_batches: + break + with self.accelerator.accumulate(self.model): loss, acc, bsz = self.loss_fn(batch) @@ -49,21 +62,17 @@ class LRFinder(): if self.accelerator.sync_gradients: progress_bar.update(1) - if step >= num_train_steps: - break - self.model.eval() with torch.inference_mode(): for step, batch in enumerate(self.val_dataloader): + if step >= num_val_batches: + break + loss, acc, bsz = self.loss_fn(batch) avg_loss.update(loss.detach_(), bsz) - if self.accelerator.sync_gradients: - progress_bar.update(1) - - if step >= num_val_steps: - break + progress_bar.update(1) lr_scheduler.step() @@ -87,6 +96,9 @@ class LRFinder(): "lr": lr, }) + self.model.load_state_dict(self.model_state) + self.optimizer.load_state_dict(self.optimizer_state) + if loss > diverge_th * best_loss: print("Stopping early, the loss has diverged") break @@ -120,8 +132,8 @@ class LRFinder(): ax.set_ylabel("Loss") -def get_exponential_schedule(optimizer, num_epochs, last_epoch=-1): +def get_exponential_schedule(optimizer, min_lr, num_epochs, last_epoch=-1): def lr_lambda(current_epoch: int): - return (current_epoch / num_epochs) ** 5 + return min_lr + ((current_epoch / num_epochs) ** 10) * (1 - min_lr) return LambdaLR(optimizer, lr_lambda, last_epoch) -- cgit v1.2.3-70-g09d2