From a0b63ee7f4a8c793c0d200c86ef07677aa4cbf2e Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Thu, 13 Apr 2023 07:14:24 +0200
Subject: Update

---
 models/convnext/discriminator.py |  8 +----
 models/sparse.py                 |  2 +-
 train_lora.py                    | 73 +++++++++++++++++++++++++---------------
 train_ti.py                      | 13 +++++--
 training/functional.py           | 35 ++++++++++---------
 training/strategy/dreambooth.py  |  7 ++--
 training/strategy/lora.py        |  6 ++--
 training/strategy/ti.py          |  3 +-
 8 files changed, 80 insertions(+), 67 deletions(-)

diff --git a/models/convnext/discriminator.py b/models/convnext/discriminator.py
index 7dbbe3a..571b915 100644
--- a/models/convnext/discriminator.py
+++ b/models/convnext/discriminator.py
@@ -15,13 +15,7 @@ class ConvNeXtDiscriminator():
         self.img_std = torch.tensor(IMAGENET_DEFAULT_STD).view(1, -1, 1, 1)
 
     def get_score(self, img):
-        img_mean = self.img_mean.to(device=img.device, dtype=img.dtype)
-        img_std = self.img_std.to(device=img.device, dtype=img.dtype)
-
-        img = ((img+1.)/2.).sub(img_mean).div(img_std)
-
-        img = F.interpolate(img, size=(self.input_size, self.input_size), mode='bicubic', align_corners=True)
-        pred = self.net(img)
+        pred = self.get_all(img)
         return torch.softmax(pred, dim=-1)[:, 1]
 
     def get_all(self, img):
diff --git a/models/sparse.py b/models/sparse.py
index bcb2897..07b3413 100644
--- a/models/sparse.py
+++ b/models/sparse.py
@@ -15,7 +15,7 @@ class PseudoSparseEmbedding(nn.Module):
         if dropout_p > 0.0:
             self.dropout = nn.Dropout(p=dropout_p)
         else:
-            self.dropout = lambda x: x
+            self.dropout = nn.Identity()
 
         self.register_buffer('mapping', torch.zeros(0, device=device, dtype=torch.long))
 
diff --git a/train_lora.py b/train_lora.py
index 29e40b2..073e939 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -86,6 +86,12 @@ def parse_args():
         default=1,
         help="How many cycles to run automatically."
     )
+    parser.add_argument(
+        "--cycle_decay",
+        type=float,
+        default=1.0,
+        help="Learning rate decay per cycle."
+    )
     parser.add_argument(
         "--placeholder_tokens",
         type=str,
@@ -924,39 +930,15 @@ def main():
     if args.sample_num is not None:
         lora_sample_frequency = math.ceil(num_train_epochs / args.sample_num)
 
-    params_to_optimize = []
     group_labels = []
     if len(args.placeholder_tokens) != 0:
-        params_to_optimize.append({
-            "params": text_encoder.text_model.embeddings.token_override_embedding.parameters(),
-            "lr": args.learning_rate_emb,
-            "weight_decay": 0,
-        })
         group_labels.append("emb")
-    params_to_optimize += [
-        {
-            "params": (
-                param
-                for param in unet.parameters()
-                if param.requires_grad
-            ),
-            "lr": args.learning_rate_unet,
-        },
-        {
-            "params": (
-                param
-                for param in itertools.chain(
-                    text_encoder.text_model.encoder.parameters(),
-                    text_encoder.text_model.final_layer_norm.parameters(),
-                )
-                if param.requires_grad
-            ),
-            "lr": args.learning_rate_text,
-        },
-    ]
     group_labels += ["unet", "text"]
 
     training_iter = 0
+    learning_rate_emb = args.learning_rate_emb
+    learning_rate_unet = args.learning_rate_unet
+    learning_rate_text = args.learning_rate_text
 
     lora_project = "lora"
 
@@ -973,6 +955,37 @@ def main():
         print(f"============ LoRA cycle {training_iter + 1} ============")
         print("")
 
+        params_to_optimize = []
+
+        if len(args.placeholder_tokens) != 0:
+            params_to_optimize.append({
+                "params": text_encoder.text_model.embeddings.token_override_embedding.parameters(),
+                "lr": learning_rate_emb,
+                "weight_decay": 0,
+            })
+            group_labels.append("emb")
+        params_to_optimize += [
+            {
+                "params": (
+                    param
+                    for param in unet.parameters()
+                    if param.requires_grad
+                ),
+                "lr": learning_rate_unet,
+            },
+            {
+                "params": (
+                    param
+                    for param in itertools.chain(
+                        text_encoder.text_model.encoder.parameters(),
+                        text_encoder.text_model.final_layer_norm.parameters(),
+                    )
+                    if param.requires_grad
+                ),
+                "lr": learning_rate_text,
+            },
+        ]
+
         lora_optimizer = create_optimizer(params_to_optimize)
 
         lora_lr_scheduler = create_lr_scheduler(
@@ -1002,6 +1015,12 @@ def main():
         )
 
         training_iter += 1
+        if args.learning_rate_emb is not None:
+            learning_rate_emb *= args.cycle_decay
+        if args.learning_rate_unet is not None:
+            learning_rate_unet *= args.cycle_decay
+        if args.learning_rate_text is not None:
+            learning_rate_text *= args.cycle_decay
 
     accelerator.end_training()
 
diff --git a/train_ti.py b/train_ti.py
index 082e9b7..94ddbb6 100644
--- a/train_ti.py
+++ b/train_ti.py
@@ -71,6 +71,12 @@ def parse_args():
         default=1,
         help="How many cycles to run automatically."
     )
+    parser.add_argument(
+        "--cycle_decay",
+        type=float,
+        default=1.0,
+        help="Learning rate decay per cycle."
+    )
     parser.add_argument(
         "--placeholder_tokens",
         type=str,
@@ -672,7 +678,6 @@ def main():
         convnext.to(accelerator.device, dtype=weight_dtype)
         convnext.requires_grad_(False)
         convnext.eval()
-        disc = ConvNeXtDiscriminator(convnext, input_size=384)
 
     if len(args.alias_tokens) != 0:
         alias_placeholder_tokens = args.alias_tokens[::2]
@@ -815,7 +820,6 @@ def main():
         milestone_checkpoints=not args.no_milestone_checkpoints,
         global_step_offset=global_step_offset,
         offset_noise_strength=args.offset_noise_strength,
-        disc=disc,
         # --
         use_emb_decay=args.use_emb_decay,
         emb_decay_target=args.emb_decay_target,
@@ -890,6 +894,7 @@ def main():
         sample_frequency = math.ceil(num_train_epochs / args.sample_num)
 
     training_iter = 0
+    learning_rate = args.learning_rate
 
     project = placeholder_tokens[0] if len(placeholder_tokens) == 1 else "ti"
 
@@ -908,7 +913,7 @@ def main():
 
         optimizer = create_optimizer(
             text_encoder.text_model.embeddings.token_override_embedding.parameters(),
-            lr=args.learning_rate,
+            lr=learning_rate,
         )
 
         lr_scheduler = get_scheduler(
@@ -948,6 +953,8 @@ def main():
         )
 
         training_iter += 1
+        if args.learning_rate is not None:
+            learning_rate *= args.cycle_decay
 
     accelerator.end_training()
 
diff --git a/training/functional.py b/training/functional.py
index be39776..ed8ae3a 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -168,8 +168,7 @@ def save_samples(
         image_grid = pipeline.numpy_to_pil(image_grid.unsqueeze(0).permute(0, 2, 3, 1).numpy())[0]
         image_grid.save(file_path, quality=85)
 
-    del generator
-    del pipeline
+    del generator, pipeline
 
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
@@ -398,31 +397,32 @@ def loss_step(
     else:
         raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
 
-    if disc is None:
-        if guidance_scale == 0 and prior_loss_weight != 0:
-            # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
-            model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
-            target, target_prior = torch.chunk(target, 2, dim=0)
+    acc = (model_pred == target).float().mean()
 
-            # Compute instance loss
-            loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+    if guidance_scale == 0 and prior_loss_weight != 0:
+        # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+        model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+        target, target_prior = torch.chunk(target, 2, dim=0)
 
-            # Compute prior loss
-            prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="none")
+        # Compute instance loss
+        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
 
-            # Add the prior loss to the instance loss.
-            loss = loss + prior_loss_weight * prior_loss
-        else:
-            loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+        # Compute prior loss
+        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="none")
 
-        loss = loss.mean([1, 2, 3])
+        # Add the prior loss to the instance loss.
+        loss = loss + prior_loss_weight * prior_loss
     else:
+        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+
+    loss = loss.mean([1, 2, 3])
+
+    if disc is not None:
         rec_latent = get_original(noise_scheduler, model_pred, noisy_latents, timesteps)
         rec_latent /= vae.config.scaling_factor
         rec_latent = rec_latent.to(dtype=vae.dtype)
         rec = vae.decode(rec_latent).sample
         loss = 1 - disc.get_score(rec)
-        del rec_latent, rec
 
     if min_snr_gamma != 0:
         snr = compute_snr(timesteps, noise_scheduler)
@@ -432,7 +432,6 @@
         loss *= mse_loss_weights
 
     loss = loss.mean()
-    acc = (model_pred == target).float().mean()
 
     return loss, acc, bsz
 
diff --git a/training/strategy/dreambooth.py b/training/strategy/dreambooth.py
index fa51bc7..4ae28b7 100644
--- a/training/strategy/dreambooth.py
+++ b/training/strategy/dreambooth.py
@@ -142,9 +142,7 @@ def dreambooth_strategy_callbacks(
         )
         pipeline.save_pretrained(checkpoint_output_dir)
 
-        del unet_
-        del text_encoder_
-        del pipeline
+        del unet_, text_encoder_, pipeline
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
@@ -165,8 +163,7 @@
         unet_.to(dtype=orig_unet_dtype)
         text_encoder_.to(dtype=orig_text_encoder_dtype)
 
-        del unet_
-        del text_encoder_
+        del unet_, text_encoder_
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
diff --git a/training/strategy/lora.py b/training/strategy/lora.py
index 73ec8f2..1517ee8 100644
--- a/training/strategy/lora.py
+++ b/training/strategy/lora.py
@@ -140,8 +140,7 @@ def lora_strategy_callbacks(
         with open(checkpoint_output_dir / "lora_config.json", "w") as f:
             json.dump(lora_config, f)
 
-        del unet_
-        del text_encoder_
+        del unet_, text_encoder_
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
@@ -153,8 +152,7 @@
 
         save_samples_(step=step, unet=unet_, text_encoder=text_encoder_)
 
-        del unet_
-        del text_encoder_
+        del unet_, text_encoder_
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
diff --git a/training/strategy/ti.py b/training/strategy/ti.py
index 08af89d..ca7cc3d 100644
--- a/training/strategy/ti.py
+++ b/training/strategy/ti.py
@@ -158,8 +158,7 @@ def textual_inversion_strategy_callbacks(
         unet_.to(dtype=orig_unet_dtype)
         text_encoder_.to(dtype=orig_text_encoder_dtype)
 
-        del unet_
-        del text_encoder_
+        del unet_, text_encoder_
 
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
--
cgit v1.2.3-70-g09d2
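
For context, the sketch below (not part of the patch) illustrates the per-cycle learning-rate decay that the new --cycle_decay flag adds to train_lora.py and train_ti.py: each cycle builds its optimizer from a working copy of the learning rate, and that copy is multiplied by the decay factor after the cycle finishes. The loop structure and every name other than --auto_cycles, --cycle_decay and --learning_rate are illustrative assumptions, since the surrounding cycle loop is not visible in the diff.

    import argparse

    def parse_args():
        parser = argparse.ArgumentParser()
        parser.add_argument("--auto_cycles", type=int, default=1,
                            help="How many cycles to run automatically.")
        parser.add_argument("--cycle_decay", type=float, default=1.0,
                            help="Learning rate decay per cycle.")
        parser.add_argument("--learning_rate", type=float, default=1e-4)
        return parser.parse_args()

    def main():
        args = parse_args()
        # Working copy of the learning rate; args.learning_rate stays untouched.
        learning_rate = args.learning_rate

        for training_iter in range(args.auto_cycles):  # hypothetical cycle loop
            print(f"============ cycle {training_iter + 1} ============")
            # In the patch, a fresh optimizer is created at this point with the
            # current rate (create_optimizer(...) over the real parameter groups).
            print(f"optimizer lr for this cycle: {learning_rate}")

            # ... one training cycle would run here ...

            # Shrink the rate for the next cycle; the default of 1.0 keeps it constant.
            if args.learning_rate is not None:
                learning_rate *= args.cycle_decay

    if __name__ == "__main__":
        main()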