diff options
| -rw-r--r-- | models/convnext/discriminator.py | 8 | ||||
| -rw-r--r-- | models/sparse.py | 2 | ||||
| -rw-r--r-- | train_lora.py | 73 | ||||
| -rw-r--r-- | train_ti.py | 13 | ||||
| -rw-r--r-- | training/functional.py | 35 | ||||
| -rw-r--r-- | training/strategy/dreambooth.py | 7 | ||||
| -rw-r--r-- | training/strategy/lora.py | 6 | ||||
| -rw-r--r-- | training/strategy/ti.py | 3 |
8 files changed, 80 insertions, 67 deletions
diff --git a/models/convnext/discriminator.py b/models/convnext/discriminator.py index 7dbbe3a..571b915 100644 --- a/models/convnext/discriminator.py +++ b/models/convnext/discriminator.py | |||
| @@ -15,13 +15,7 @@ class ConvNeXtDiscriminator(): | |||
| 15 | self.img_std = torch.tensor(IMAGENET_DEFAULT_STD).view(1, -1, 1, 1) | 15 | self.img_std = torch.tensor(IMAGENET_DEFAULT_STD).view(1, -1, 1, 1) |
| 16 | 16 | ||
| 17 | def get_score(self, img): | 17 | def get_score(self, img): |
| 18 | img_mean = self.img_mean.to(device=img.device, dtype=img.dtype) | 18 | pred = self.get_all(img) |
| 19 | img_std = self.img_std.to(device=img.device, dtype=img.dtype) | ||
| 20 | |||
| 21 | img = ((img+1.)/2.).sub(img_mean).div(img_std) | ||
| 22 | |||
| 23 | img = F.interpolate(img, size=(self.input_size, self.input_size), mode='bicubic', align_corners=True) | ||
| 24 | pred = self.net(img) | ||
| 25 | return torch.softmax(pred, dim=-1)[:, 1] | 19 | return torch.softmax(pred, dim=-1)[:, 1] |
| 26 | 20 | ||
| 27 | def get_all(self, img): | 21 | def get_all(self, img): |
diff --git a/models/sparse.py b/models/sparse.py index bcb2897..07b3413 100644 --- a/models/sparse.py +++ b/models/sparse.py | |||
| @@ -15,7 +15,7 @@ class PseudoSparseEmbedding(nn.Module): | |||
| 15 | if dropout_p > 0.0: | 15 | if dropout_p > 0.0: |
| 16 | self.dropout = nn.Dropout(p=dropout_p) | 16 | self.dropout = nn.Dropout(p=dropout_p) |
| 17 | else: | 17 | else: |
| 18 | self.dropout = lambda x: x | 18 | self.dropout = nn.Identity() |
| 19 | 19 | ||
| 20 | self.register_buffer('mapping', torch.zeros(0, device=device, dtype=torch.long)) | 20 | self.register_buffer('mapping', torch.zeros(0, device=device, dtype=torch.long)) |
| 21 | 21 | ||
diff --git a/train_lora.py b/train_lora.py index 29e40b2..073e939 100644 --- a/train_lora.py +++ b/train_lora.py | |||
| @@ -87,6 +87,12 @@ def parse_args(): | |||
| 87 | help="How many cycles to run automatically." | 87 | help="How many cycles to run automatically." |
| 88 | ) | 88 | ) |
| 89 | parser.add_argument( | 89 | parser.add_argument( |
| 90 | "--cycle_decay", | ||
| 91 | type=float, | ||
| 92 | default=1.0, | ||
| 93 | help="Learning rate decay per cycle." | ||
| 94 | ) | ||
| 95 | parser.add_argument( | ||
| 90 | "--placeholder_tokens", | 96 | "--placeholder_tokens", |
| 91 | type=str, | 97 | type=str, |
| 92 | nargs='*', | 98 | nargs='*', |
| @@ -924,39 +930,15 @@ def main(): | |||
| 924 | if args.sample_num is not None: | 930 | if args.sample_num is not None: |
| 925 | lora_sample_frequency = math.ceil(num_train_epochs / args.sample_num) | 931 | lora_sample_frequency = math.ceil(num_train_epochs / args.sample_num) |
| 926 | 932 | ||
| 927 | params_to_optimize = [] | ||
| 928 | group_labels = [] | 933 | group_labels = [] |
| 929 | if len(args.placeholder_tokens) != 0: | 934 | if len(args.placeholder_tokens) != 0: |
| 930 | params_to_optimize.append({ | ||
| 931 | "params": text_encoder.text_model.embeddings.token_override_embedding.parameters(), | ||
| 932 | "lr": args.learning_rate_emb, | ||
| 933 | "weight_decay": 0, | ||
| 934 | }) | ||
| 935 | group_labels.append("emb") | 935 | group_labels.append("emb") |
| 936 | params_to_optimize += [ | ||
| 937 | { | ||
| 938 | "params": ( | ||
| 939 | param | ||
| 940 | for param in unet.parameters() | ||
| 941 | if param.requires_grad | ||
| 942 | ), | ||
| 943 | "lr": args.learning_rate_unet, | ||
| 944 | }, | ||
| 945 | { | ||
| 946 | "params": ( | ||
| 947 | param | ||
| 948 | for param in itertools.chain( | ||
| 949 | text_encoder.text_model.encoder.parameters(), | ||
| 950 | text_encoder.text_model.final_layer_norm.parameters(), | ||
| 951 | ) | ||
| 952 | if param.requires_grad | ||
| 953 | ), | ||
| 954 | "lr": args.learning_rate_text, | ||
| 955 | }, | ||
| 956 | ] | ||
| 957 | group_labels += ["unet", "text"] | 936 | group_labels += ["unet", "text"] |
| 958 | 937 | ||
| 959 | training_iter = 0 | 938 | training_iter = 0 |
| 939 | learning_rate_emb = args.learning_rate_emb | ||
| 940 | learning_rate_unet = args.learning_rate_unet | ||
| 941 | learning_rate_text = args.learning_rate_text | ||
| 960 | 942 | ||
| 961 | lora_project = "lora" | 943 | lora_project = "lora" |
| 962 | 944 | ||
| @@ -973,6 +955,37 @@ def main(): | |||
| 973 | print(f"============ LoRA cycle {training_iter + 1} ============") | 955 | print(f"============ LoRA cycle {training_iter + 1} ============") |
| 974 | print("") | 956 | print("") |
| 975 | 957 | ||
| 958 | params_to_optimize = [] | ||
| 959 | |||
| 960 | if len(args.placeholder_tokens) != 0: | ||
| 961 | params_to_optimize.append({ | ||
| 962 | "params": text_encoder.text_model.embeddings.token_override_embedding.parameters(), | ||
| 963 | "lr": learning_rate_emb, | ||
| 964 | "weight_decay": 0, | ||
| 965 | }) | ||
| 966 | group_labels.append("emb") | ||
| 967 | params_to_optimize += [ | ||
| 968 | { | ||
| 969 | "params": ( | ||
| 970 | param | ||
| 971 | for param in unet.parameters() | ||
| 972 | if param.requires_grad | ||
| 973 | ), | ||
| 974 | "lr": learning_rate_unet, | ||
| 975 | }, | ||
| 976 | { | ||
| 977 | "params": ( | ||
| 978 | param | ||
| 979 | for param in itertools.chain( | ||
| 980 | text_encoder.text_model.encoder.parameters(), | ||
| 981 | text_encoder.text_model.final_layer_norm.parameters(), | ||
| 982 | ) | ||
| 983 | if param.requires_grad | ||
| 984 | ), | ||
| 985 | "lr": learning_rate_text, | ||
| 986 | }, | ||
| 987 | ] | ||
| 988 | |||
| 976 | lora_optimizer = create_optimizer(params_to_optimize) | 989 | lora_optimizer = create_optimizer(params_to_optimize) |
| 977 | 990 | ||
| 978 | lora_lr_scheduler = create_lr_scheduler( | 991 | lora_lr_scheduler = create_lr_scheduler( |
| @@ -1002,6 +1015,12 @@ def main(): | |||
| 1002 | ) | 1015 | ) |
| 1003 | 1016 | ||
| 1004 | training_iter += 1 | 1017 | training_iter += 1 |
| 1018 | if args.learning_rate_emb is not None: | ||
| 1019 | learning_rate_emb *= args.cycle_decay | ||
| 1020 | if args.learning_rate_unet is not None: | ||
| 1021 | learning_rate_unet *= args.cycle_decay | ||
| 1022 | if args.learning_rate_text is not None: | ||
| 1023 | learning_rate_text *= args.cycle_decay | ||
| 1005 | 1024 | ||
| 1006 | accelerator.end_training() | 1025 | accelerator.end_training() |
| 1007 | 1026 | ||
diff --git a/train_ti.py b/train_ti.py index 082e9b7..94ddbb6 100644 --- a/train_ti.py +++ b/train_ti.py | |||
| @@ -72,6 +72,12 @@ def parse_args(): | |||
| 72 | help="How many cycles to run automatically." | 72 | help="How many cycles to run automatically." |
| 73 | ) | 73 | ) |
| 74 | parser.add_argument( | 74 | parser.add_argument( |
| 75 | "--cycle_decay", | ||
| 76 | type=float, | ||
| 77 | default=1.0, | ||
| 78 | help="Learning rate decay per cycle." | ||
| 79 | ) | ||
| 80 | parser.add_argument( | ||
| 75 | "--placeholder_tokens", | 81 | "--placeholder_tokens", |
| 76 | type=str, | 82 | type=str, |
| 77 | nargs='*', | 83 | nargs='*', |
| @@ -672,7 +678,6 @@ def main(): | |||
| 672 | convnext.to(accelerator.device, dtype=weight_dtype) | 678 | convnext.to(accelerator.device, dtype=weight_dtype) |
| 673 | convnext.requires_grad_(False) | 679 | convnext.requires_grad_(False) |
| 674 | convnext.eval() | 680 | convnext.eval() |
| 675 | disc = ConvNeXtDiscriminator(convnext, input_size=384) | ||
| 676 | 681 | ||
| 677 | if len(args.alias_tokens) != 0: | 682 | if len(args.alias_tokens) != 0: |
| 678 | alias_placeholder_tokens = args.alias_tokens[::2] | 683 | alias_placeholder_tokens = args.alias_tokens[::2] |
| @@ -815,7 +820,6 @@ def main(): | |||
| 815 | milestone_checkpoints=not args.no_milestone_checkpoints, | 820 | milestone_checkpoints=not args.no_milestone_checkpoints, |
| 816 | global_step_offset=global_step_offset, | 821 | global_step_offset=global_step_offset, |
| 817 | offset_noise_strength=args.offset_noise_strength, | 822 | offset_noise_strength=args.offset_noise_strength, |
| 818 | disc=disc, | ||
| 819 | # -- | 823 | # -- |
| 820 | use_emb_decay=args.use_emb_decay, | 824 | use_emb_decay=args.use_emb_decay, |
| 821 | emb_decay_target=args.emb_decay_target, | 825 | emb_decay_target=args.emb_decay_target, |
| @@ -890,6 +894,7 @@ def main(): | |||
| 890 | sample_frequency = math.ceil(num_train_epochs / args.sample_num) | 894 | sample_frequency = math.ceil(num_train_epochs / args.sample_num) |
| 891 | 895 | ||
| 892 | training_iter = 0 | 896 | training_iter = 0 |
| 897 | learning_rate = args.learning_rate | ||
| 893 | 898 | ||
| 894 | project = placeholder_tokens[0] if len(placeholder_tokens) == 1 else "ti" | 899 | project = placeholder_tokens[0] if len(placeholder_tokens) == 1 else "ti" |
| 895 | 900 | ||
| @@ -908,7 +913,7 @@ def main(): | |||
| 908 | 913 | ||
| 909 | optimizer = create_optimizer( | 914 | optimizer = create_optimizer( |
| 910 | text_encoder.text_model.embeddings.token_override_embedding.parameters(), | 915 | text_encoder.text_model.embeddings.token_override_embedding.parameters(), |
| 911 | lr=args.learning_rate, | 916 | lr=learning_rate, |
| 912 | ) | 917 | ) |
| 913 | 918 | ||
| 914 | lr_scheduler = get_scheduler( | 919 | lr_scheduler = get_scheduler( |
| @@ -948,6 +953,8 @@ def main(): | |||
| 948 | ) | 953 | ) |
| 949 | 954 | ||
| 950 | training_iter += 1 | 955 | training_iter += 1 |
| 956 | if args.learning_rate is not None: | ||
| 957 | learning_rate *= args.cycle_decay | ||
| 951 | 958 | ||
| 952 | accelerator.end_training() | 959 | accelerator.end_training() |
| 953 | 960 | ||
diff --git a/training/functional.py b/training/functional.py index be39776..ed8ae3a 100644 --- a/training/functional.py +++ b/training/functional.py | |||
| @@ -168,8 +168,7 @@ def save_samples( | |||
| 168 | image_grid = pipeline.numpy_to_pil(image_grid.unsqueeze(0).permute(0, 2, 3, 1).numpy())[0] | 168 | image_grid = pipeline.numpy_to_pil(image_grid.unsqueeze(0).permute(0, 2, 3, 1).numpy())[0] |
| 169 | image_grid.save(file_path, quality=85) | 169 | image_grid.save(file_path, quality=85) |
| 170 | 170 | ||
| 171 | del generator | 171 | del generator, pipeline |
| 172 | del pipeline | ||
| 173 | 172 | ||
| 174 | if torch.cuda.is_available(): | 173 | if torch.cuda.is_available(): |
| 175 | torch.cuda.empty_cache() | 174 | torch.cuda.empty_cache() |
| @@ -398,31 +397,32 @@ def loss_step( | |||
| 398 | else: | 397 | else: |
| 399 | raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") | 398 | raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") |
| 400 | 399 | ||
| 401 | if disc is None: | 400 | acc = (model_pred == target).float().mean() |
| 402 | if guidance_scale == 0 and prior_loss_weight != 0: | ||
| 403 | # Chunk the noise and model_pred into two parts and compute the loss on each part separately. | ||
| 404 | model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) | ||
| 405 | target, target_prior = torch.chunk(target, 2, dim=0) | ||
| 406 | 401 | ||
| 407 | # Compute instance loss | 402 | if guidance_scale == 0 and prior_loss_weight != 0: |
| 408 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") | 403 | # Chunk the noise and model_pred into two parts and compute the loss on each part separately. |
| 404 | model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) | ||
| 405 | target, target_prior = torch.chunk(target, 2, dim=0) | ||
| 409 | 406 | ||
| 410 | # Compute prior loss | 407 | # Compute instance loss |
| 411 | prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="none") | 408 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") |
| 412 | 409 | ||
| 413 | # Add the prior loss to the instance loss. | 410 | # Compute prior loss |
| 414 | loss = loss + prior_loss_weight * prior_loss | 411 | prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="none") |
| 415 | else: | ||
| 416 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") | ||
| 417 | 412 | ||
| 418 | loss = loss.mean([1, 2, 3]) | 413 | # Add the prior loss to the instance loss. |
| 414 | loss = loss + prior_loss_weight * prior_loss | ||
| 419 | else: | 415 | else: |
| 416 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") | ||
| 417 | |||
| 418 | loss = loss.mean([1, 2, 3]) | ||
| 419 | |||
| 420 | if disc is not None: | ||
| 420 | rec_latent = get_original(noise_scheduler, model_pred, noisy_latents, timesteps) | 421 | rec_latent = get_original(noise_scheduler, model_pred, noisy_latents, timesteps) |
| 421 | rec_latent /= vae.config.scaling_factor | 422 | rec_latent /= vae.config.scaling_factor |
| 422 | rec_latent = rec_latent.to(dtype=vae.dtype) | 423 | rec_latent = rec_latent.to(dtype=vae.dtype) |
| 423 | rec = vae.decode(rec_latent).sample | 424 | rec = vae.decode(rec_latent).sample |
| 424 | loss = 1 - disc.get_score(rec) | 425 | loss = 1 - disc.get_score(rec) |
| 425 | del rec_latent, rec | ||
| 426 | 426 | ||
| 427 | if min_snr_gamma != 0: | 427 | if min_snr_gamma != 0: |
| 428 | snr = compute_snr(timesteps, noise_scheduler) | 428 | snr = compute_snr(timesteps, noise_scheduler) |
| @@ -432,7 +432,6 @@ def loss_step( | |||
| 432 | loss *= mse_loss_weights | 432 | loss *= mse_loss_weights |
| 433 | 433 | ||
| 434 | loss = loss.mean() | 434 | loss = loss.mean() |
| 435 | acc = (model_pred == target).float().mean() | ||
| 436 | 435 | ||
| 437 | return loss, acc, bsz | 436 | return loss, acc, bsz |
| 438 | 437 | ||
diff --git a/training/strategy/dreambooth.py b/training/strategy/dreambooth.py index fa51bc7..4ae28b7 100644 --- a/training/strategy/dreambooth.py +++ b/training/strategy/dreambooth.py | |||
| @@ -142,9 +142,7 @@ def dreambooth_strategy_callbacks( | |||
| 142 | ) | 142 | ) |
| 143 | pipeline.save_pretrained(checkpoint_output_dir) | 143 | pipeline.save_pretrained(checkpoint_output_dir) |
| 144 | 144 | ||
| 145 | del unet_ | 145 | del unet_, text_encoder_, pipeline |
| 146 | del text_encoder_ | ||
| 147 | del pipeline | ||
| 148 | 146 | ||
| 149 | if torch.cuda.is_available(): | 147 | if torch.cuda.is_available(): |
| 150 | torch.cuda.empty_cache() | 148 | torch.cuda.empty_cache() |
| @@ -165,8 +163,7 @@ def dreambooth_strategy_callbacks( | |||
| 165 | unet_.to(dtype=orig_unet_dtype) | 163 | unet_.to(dtype=orig_unet_dtype) |
| 166 | text_encoder_.to(dtype=orig_text_encoder_dtype) | 164 | text_encoder_.to(dtype=orig_text_encoder_dtype) |
| 167 | 165 | ||
| 168 | del unet_ | 166 | del unet_, text_encoder_ |
| 169 | del text_encoder_ | ||
| 170 | 167 | ||
| 171 | if torch.cuda.is_available(): | 168 | if torch.cuda.is_available(): |
| 172 | torch.cuda.empty_cache() | 169 | torch.cuda.empty_cache() |
diff --git a/training/strategy/lora.py b/training/strategy/lora.py index 73ec8f2..1517ee8 100644 --- a/training/strategy/lora.py +++ b/training/strategy/lora.py | |||
| @@ -140,8 +140,7 @@ def lora_strategy_callbacks( | |||
| 140 | with open(checkpoint_output_dir / "lora_config.json", "w") as f: | 140 | with open(checkpoint_output_dir / "lora_config.json", "w") as f: |
| 141 | json.dump(lora_config, f) | 141 | json.dump(lora_config, f) |
| 142 | 142 | ||
| 143 | del unet_ | 143 | del unet_, text_encoder_ |
| 144 | del text_encoder_ | ||
| 145 | 144 | ||
| 146 | if torch.cuda.is_available(): | 145 | if torch.cuda.is_available(): |
| 147 | torch.cuda.empty_cache() | 146 | torch.cuda.empty_cache() |
| @@ -153,8 +152,7 @@ def lora_strategy_callbacks( | |||
| 153 | 152 | ||
| 154 | save_samples_(step=step, unet=unet_, text_encoder=text_encoder_) | 153 | save_samples_(step=step, unet=unet_, text_encoder=text_encoder_) |
| 155 | 154 | ||
| 156 | del unet_ | 155 | del unet_, text_encoder_ |
| 157 | del text_encoder_ | ||
| 158 | 156 | ||
| 159 | if torch.cuda.is_available(): | 157 | if torch.cuda.is_available(): |
| 160 | torch.cuda.empty_cache() | 158 | torch.cuda.empty_cache() |
diff --git a/training/strategy/ti.py b/training/strategy/ti.py index 08af89d..ca7cc3d 100644 --- a/training/strategy/ti.py +++ b/training/strategy/ti.py | |||
| @@ -158,8 +158,7 @@ def textual_inversion_strategy_callbacks( | |||
| 158 | unet_.to(dtype=orig_unet_dtype) | 158 | unet_.to(dtype=orig_unet_dtype) |
| 159 | text_encoder_.to(dtype=orig_text_encoder_dtype) | 159 | text_encoder_.to(dtype=orig_text_encoder_dtype) |
| 160 | 160 | ||
| 161 | del unet_ | 161 | del unet_, text_encoder_ |
| 162 | del text_encoder_ | ||
| 163 | 162 | ||
| 164 | if torch.cuda.is_available(): | 163 | if torch.cuda.is_available(): |
| 165 | torch.cuda.empty_cache() | 164 | torch.cuda.empty_cache() |
