From 6b8a93f46f053668c8023520225a18445d48d8f1 Mon Sep 17 00:00:00 2001 From: Volpeon Date: Sat, 25 Mar 2023 16:34:48 +0100 Subject: Update --- data/csv.py | 21 +- .../stable_diffusion/vlpn_stable_diffusion.py | 61 +- ...nvert_diffusers_to_original_stable_diffusion.py | 234 ------- ...nvert_original_stable_diffusion_to_diffusers.py | 690 --------------------- train_dreambooth.py | 14 +- train_lora.py | 16 +- train_ti.py | 14 +- training/functional.py | 36 +- 8 files changed, 99 insertions(+), 987 deletions(-) delete mode 100644 scripts/convert_diffusers_to_original_stable_diffusion.py delete mode 100644 scripts/convert_original_stable_diffusion_to_diffusers.py diff --git a/data/csv.py b/data/csv.py index fba5d4b..a6cd065 100644 --- a/data/csv.py +++ b/data/csv.py @@ -99,14 +99,16 @@ def generate_buckets( return buckets, bucket_items, bucket_assignments -def collate_fn(dtype: torch.dtype, tokenizer: CLIPTokenizer, with_prior_preservation: bool, examples): +def collate_fn(dtype: torch.dtype, tokenizer: CLIPTokenizer, with_guidance: bool, with_prior_preservation: bool, examples): prompt_ids = [example["prompt_ids"] for example in examples] nprompt_ids = [example["nprompt_ids"] for example in examples] input_ids = [example["instance_prompt_ids"] for example in examples] pixel_values = [example["instance_images"] for example in examples] - if with_prior_preservation: + if with_guidance: + input_ids += [example["negative_prompt_ids"] for example in examples] + elif with_prior_preservation: input_ids += [example["class_prompt_ids"] for example in examples] pixel_values += [example["class_images"] for example in examples] @@ -133,7 +135,7 @@ class VlpnDataItem(NamedTuple): class_image_path: Path prompt: list[str] cprompt: str - nprompt: str + nprompt: list[str] collection: list[str] @@ -163,6 +165,7 @@ class VlpnDataModule(): data_file: str, tokenizer: CLIPTokenizer, class_subdir: str = "cls", + with_guidance: bool = False, num_class_images: int = 1, size: int = 768, num_buckets: int = 0, @@ -191,6 +194,7 @@ class VlpnDataModule(): self.class_root = self.data_root / class_subdir self.class_root.mkdir(parents=True, exist_ok=True) self.num_class_images = num_class_images + self.with_guidance = with_guidance self.tokenizer = tokenizer self.size = size @@ -228,10 +232,10 @@ class VlpnDataModule(): cprompt.format(**prepare_prompt(item["prompt"] if "prompt" in item else "")), expansions )), - keywords_to_prompt(prompt_to_keywords( + prompt_to_keywords( nprompt.format(**prepare_prompt(item["nprompt"] if "nprompt" in item else "")), expansions - )), + ), item["collection"].split(", ") if "collection" in item else [] ) for item in data @@ -279,7 +283,7 @@ class VlpnDataModule(): if self.seed is not None: generator = generator.manual_seed(self.seed) - collate_fn_ = partial(collate_fn, self.dtype, self.tokenizer, self.num_class_images != 0) + collate_fn_ = partial(collate_fn, self.dtype, self.tokenizer, self.with_guidance, self.num_class_images != 0) if valid_set_size == 0: data_train, data_val = items, items @@ -443,11 +447,14 @@ class VlpnDataset(IterableDataset): example = {} example["prompt_ids"] = self.get_input_ids(keywords_to_prompt(item.prompt)) - example["nprompt_ids"] = self.get_input_ids(item.nprompt) + example["nprompt_ids"] = self.get_input_ids(keywords_to_prompt(item.nprompt)) example["instance_prompt_ids"] = self.get_input_ids( keywords_to_prompt(item.prompt, self.dropout, True) ) + example["negative_prompt_ids"] = self.get_input_ids( + keywords_to_prompt(item.nprompt, self.dropout, True) + ) example["instance_images"] = image_transforms(get_image(item.instance_image_path)) if self.num_class_images != 0: diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py index ea2a656..127ca50 100644 --- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py +++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py @@ -307,39 +307,45 @@ class VlpnStableDiffusion(DiffusionPipeline): return timesteps, num_inference_steps - t_start - def prepare_image(self, batch_size, width, height, dtype, device, generator=None): - return (1.4 * perlin_noise( + def prepare_brightness_offset(self, batch_size, height, width, dtype, device, generator=None): + offset_image = perlin_noise( (batch_size, 1, width, height), res=1, - octaves=4, generator=generator, dtype=dtype, device=device - )).clamp(-1, 1).expand(batch_size, 3, width, height) + ) + offset_latents = self.vae.encode(offset_image).latent_dist.sample(generator=generator) + offset_latents = self.vae.config.scaling_factor * offset_latents + return offset_latents - def prepare_latents_from_image(self, init_image, timestep, batch_size, dtype, device, generator=None): + def prepare_latents_from_image(self, init_image, timestep, batch_size, brightness_offset, dtype, device, generator=None): init_image = init_image.to(device=device, dtype=dtype) - init_latents = self.vae.encode(init_image).latent_dist.sample(generator=generator) - init_latents = self.vae.config.scaling_factor * init_latents + latents = self.vae.encode(init_image).latent_dist.sample(generator=generator) + latents = self.vae.config.scaling_factor * latents - if batch_size % init_latents.shape[0] != 0: + if batch_size % latents.shape[0] != 0: raise ValueError( - f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + f"Cannot duplicate `init_image` of batch size {latents.shape[0]} to {batch_size} text prompts." ) else: - batch_multiplier = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * batch_multiplier, dim=0) + batch_multiplier = batch_size // latents.shape[0] + latents = torch.cat([latents] * batch_multiplier, dim=0) # add noise to latents using the timesteps - noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype) + noise = torch.randn(latents.shape, generator=generator, device=device, dtype=dtype) + + if brightness_offset != 0: + noise += brightness_offset * self.prepare_brightness_offset( + batch_size, init_image.shape[3], init_image.shape[2], dtype, device, generator + ) # get latents - init_latents = self.scheduler.add_noise(init_latents, noise, timestep) - latents = init_latents + latents = self.scheduler.add_noise(latents, noise, timestep) return latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + def prepare_latents(self, batch_size, num_channels_latents, height, width, brightness_offset, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -352,6 +358,11 @@ class VlpnStableDiffusion(DiffusionPipeline): else: latents = latents.to(device) + if brightness_offset != 0: + latents += brightness_offset * self.prepare_brightness_offset( + batch_size, height, width, dtype, device, generator + ) + # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma return latents @@ -395,7 +406,8 @@ class VlpnStableDiffusion(DiffusionPipeline): sag_scale: float = 0.75, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image, Literal["noise"]]] = None, + image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + brightness_offset: Union[float, torch.FloatTensor] = 0, output_type: str = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, @@ -468,7 +480,7 @@ class VlpnStableDiffusion(DiffusionPipeline): num_channels_latents = self.unet.in_channels do_classifier_free_guidance = guidance_scale > 1.0 do_self_attention_guidance = sag_scale > 0.0 - prep_from_image = isinstance(image, PIL.Image.Image) or image == "noise" + prep_from_image = isinstance(image, PIL.Image.Image) # 3. Encode input prompt prompt_embeds = self.encode_prompt( @@ -482,15 +494,6 @@ class VlpnStableDiffusion(DiffusionPipeline): # 4. Prepare latent variables if isinstance(image, PIL.Image.Image): image = preprocess(image) - elif image == "noise": - image = self.prepare_image( - batch_size * num_images_per_prompt, - width, - height, - prompt_embeds.dtype, - device, - generator - ) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -503,9 +506,10 @@ class VlpnStableDiffusion(DiffusionPipeline): image, latent_timestep, batch_size * num_images_per_prompt, + brightness_offset, prompt_embeds.dtype, device, - generator + generator, ) else: latents = self.prepare_latents( @@ -513,10 +517,11 @@ class VlpnStableDiffusion(DiffusionPipeline): num_channels_latents, height, width, + brightness_offset, prompt_embeds.dtype, device, generator, - image + image, ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline diff --git a/scripts/convert_diffusers_to_original_stable_diffusion.py b/scripts/convert_diffusers_to_original_stable_diffusion.py deleted file mode 100644 index 9888f62..0000000 --- a/scripts/convert_diffusers_to_original_stable_diffusion.py +++ /dev/null @@ -1,234 +0,0 @@ -# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint. -# *Only* converts the UNet, VAE, and Text Encoder. -# Does not convert optimizer state or any other thing. - -import argparse -import os.path as osp - -import torch - - -# =================# -# UNet Conversion # -# =================# - -unet_conversion_map = [ - # (stable-diffusion, HF Diffusers) - ("time_embed.0.weight", "time_embedding.linear_1.weight"), - ("time_embed.0.bias", "time_embedding.linear_1.bias"), - ("time_embed.2.weight", "time_embedding.linear_2.weight"), - ("time_embed.2.bias", "time_embedding.linear_2.bias"), - ("input_blocks.0.0.weight", "conv_in.weight"), - ("input_blocks.0.0.bias", "conv_in.bias"), - ("out.0.weight", "conv_norm_out.weight"), - ("out.0.bias", "conv_norm_out.bias"), - ("out.2.weight", "conv_out.weight"), - ("out.2.bias", "conv_out.bias"), -] - -unet_conversion_map_resnet = [ - # (stable-diffusion, HF Diffusers) - ("in_layers.0", "norm1"), - ("in_layers.2", "conv1"), - ("out_layers.0", "norm2"), - ("out_layers.3", "conv2"), - ("emb_layers.1", "time_emb_proj"), - ("skip_connection", "conv_shortcut"), -] - -unet_conversion_map_layer = [] -# hardcoded number of downblocks and resnets/attentions... -# would need smarter logic for other networks. -for i in range(4): - # loop over downblocks/upblocks - - for j in range(2): - # loop over resnets/attentions for downblocks - hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." - sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." - unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) - - if i < 3: - # no attention layers in down_blocks.3 - hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." - sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." - unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) - - for j in range(3): - # loop over resnets/attentions for upblocks - hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}." - sd_up_res_prefix = f"output_blocks.{3*i + j}.0." - unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix)) - - if i > 0: - # no attention layers in up_blocks.0 - hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." - sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." - unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) - - if i < 3: - # no downsample in down_blocks.3 - hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." - sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." - unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) - - # no upsample in up_blocks.3 - hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." - sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." - unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) - -hf_mid_atn_prefix = "mid_block.attentions.0." -sd_mid_atn_prefix = "middle_block.1." -unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix)) - -for j in range(2): - hf_mid_res_prefix = f"mid_block.resnets.{j}." - sd_mid_res_prefix = f"middle_block.{2*j}." - unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix)) - - -def convert_unet_state_dict(unet_state_dict): - # buyer beware: this is a *brittle* function, - # and correct output requires that all of these pieces interact in - # the exact order in which I have arranged them. - mapping = {k: k for k in unet_state_dict.keys()} - for sd_name, hf_name in unet_conversion_map: - mapping[hf_name] = sd_name - for k, v in mapping.items(): - if "resnets" in k: - for sd_part, hf_part in unet_conversion_map_resnet: - v = v.replace(hf_part, sd_part) - mapping[k] = v - for k, v in mapping.items(): - for sd_part, hf_part in unet_conversion_map_layer: - v = v.replace(hf_part, sd_part) - mapping[k] = v - new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()} - return new_state_dict - - -# ================# -# VAE Conversion # -# ================# - -vae_conversion_map = [ - # (stable-diffusion, HF Diffusers) - ("nin_shortcut", "conv_shortcut"), - ("norm_out", "conv_norm_out"), - ("mid.attn_1.", "mid_block.attentions.0."), -] - -for i in range(4): - # down_blocks have two resnets - for j in range(2): - hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}." - sd_down_prefix = f"encoder.down.{i}.block.{j}." - vae_conversion_map.append((sd_down_prefix, hf_down_prefix)) - - if i < 3: - hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0." - sd_downsample_prefix = f"down.{i}.downsample." - vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix)) - - hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." - sd_upsample_prefix = f"up.{3-i}.upsample." - vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix)) - - # up_blocks have three resnets - # also, up blocks in hf are numbered in reverse from sd - for j in range(3): - hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}." - sd_up_prefix = f"decoder.up.{3-i}.block.{j}." - vae_conversion_map.append((sd_up_prefix, hf_up_prefix)) - -# this part accounts for mid blocks in both the encoder and the decoder -for i in range(2): - hf_mid_res_prefix = f"mid_block.resnets.{i}." - sd_mid_res_prefix = f"mid.block_{i+1}." - vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix)) - - -vae_conversion_map_attn = [ - # (stable-diffusion, HF Diffusers) - ("norm.", "group_norm."), - ("q.", "query."), - ("k.", "key."), - ("v.", "value."), - ("proj_out.", "proj_attn."), -] - - -def reshape_weight_for_sd(w): - # convert HF linear weights to SD conv2d weights - return w.reshape(*w.shape, 1, 1) - - -def convert_vae_state_dict(vae_state_dict): - mapping = {k: k for k in vae_state_dict.keys()} - for k, v in mapping.items(): - for sd_part, hf_part in vae_conversion_map: - v = v.replace(hf_part, sd_part) - mapping[k] = v - for k, v in mapping.items(): - if "attentions" in k: - for sd_part, hf_part in vae_conversion_map_attn: - v = v.replace(hf_part, sd_part) - mapping[k] = v - new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()} - weights_to_convert = ["q", "k", "v", "proj_out"] - for k, v in new_state_dict.items(): - for weight_name in weights_to_convert: - if f"mid.attn_1.{weight_name}.weight" in k: - print(f"Reshaping {k} for SD format") - new_state_dict[k] = reshape_weight_for_sd(v) - return new_state_dict - - -# =========================# -# Text Encoder Conversion # -# =========================# -# pretty much a no-op - - -def convert_text_enc_state_dict(text_enc_dict): - return text_enc_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.") - parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.") - parser.add_argument("--half", action="store_true", help="Save weights in half precision.") - - args = parser.parse_args() - - assert args.model_path is not None, "Must provide a model path!" - - assert args.checkpoint_path is not None, "Must provide a checkpoint path!" - - unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin") - vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin") - text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin") - - # Convert the UNet model - unet_state_dict = torch.load(unet_path, map_location="cpu") - unet_state_dict = convert_unet_state_dict(unet_state_dict) - unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()} - - # Convert the VAE model - vae_state_dict = torch.load(vae_path, map_location="cpu") - vae_state_dict = convert_vae_state_dict(vae_state_dict) - vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()} - - # Convert the text encoder model - text_enc_dict = torch.load(text_enc_path, map_location="cpu") - text_enc_dict = convert_text_enc_state_dict(text_enc_dict) - text_enc_dict = {"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()} - - # Put together new checkpoint - state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} - if args.half: - state_dict = {k: v.half() for k, v in state_dict.items()} - state_dict = {"state_dict": state_dict} - torch.save(state_dict, args.checkpoint_path) diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py deleted file mode 100644 index ee7fc33..0000000 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ /dev/null @@ -1,690 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the LDM checkpoints. """ - -import argparse -import os - -import torch - - -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`." - ) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - LDMTextToImagePipeline, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer - - -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. - """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - # new_item = new_item.replace('norm.weight', 'group_norm.weight') - # new_item = new_item.replace('norm.bias', 'group_norm.bias') - - # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None -): - """ - This does the final conversion step: take locally converted weights and apply a global renaming - to them. It splits attention layers, and takes into account additional replacements - that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. - if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - unet_params = original_config.model.params.unet_config.params - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - config = dict( - sample_size=unet_params.image_size, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, - ) - - return config - - -def create_vae_diffusers_config(original_config): - """ - Creates a config for the diffusers based on the config of the LDM model. - """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - config = dict( - sample_size=vae_params.resolution, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - ) - return config - - -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -def create_ldm_bert_config(original_config): - bert_params = original_config.model.parms.cond_stage_config.params - config = LDMBertConfig( - d_model=bert_params.n_embed, - encoder_layers=bert_params.n_layer, - encoder_ffn_dim=bert_params.n_embed * 4, - ) - return config - - -def convert_ldm_unet_checkpoint(checkpoint, config): - """ - Takes a state dict and a config, and returns a converted checkpoint. - """ - - # extract state_dict for UNet - unet_state_dict = {} - unet_key = "model.diffusion_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -def convert_ldm_bert_checkpoint(checkpoint, config): - def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight - hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight - hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight - - hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight - hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - - def _copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - def _copy_layer(hf_layer, pt_layer): - # copy layer norms - _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) - _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - - # copy attn - _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - - # copy MLP - pt_mlp = pt_layer[1][1] - _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - - def _copy_layers(hf_layers, pt_layers): - for i, hf_layer in enumerate(hf_layers): - if i != 0: - i += i - pt_layer = pt_layers[i : i + 2] - _copy_layer(hf_layer, pt_layer) - - hf_model = LDMBertModel(config).eval() - - # copy embeds - hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight - hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight - - # copy layer norm - _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) - - # copy hidden layers - _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - - _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) - - return hf_model - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." - ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml - parser.add_argument( - "--original_config_file", - default=None, - type=str, - help="The YAML config file corresponding to the original architecture.", - ) - parser.add_argument( - "--scheduler_type", - default="pndm", - type=str, - help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']", - ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") - - args = parser.parse_args() - - if args.original_config_file is None: - os.system( - "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" - ) - args.original_config_file = "./v1-inference.yaml" - - original_config = OmegaConf.load(args.original_config_file) - checkpoint = torch.load(args.checkpoint_path)["state_dict"] - - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - if args.scheduler_type == "pndm": - scheduler = PNDMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - skip_prk_steps=True, - ) - elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") - elif args.scheduler_type == "ddim": - scheduler = DDIMScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - else: - raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") - - # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(original_config) - converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config) - - unet = UNet2DConditionModel(**unet_config) - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - if text_model_type == "FrozenCLIPEmbedder": - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") - feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - text_config = create_ldm_bert_config(original_config) - text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") - pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) - - pipe.save_pretrained(args.dump_path) diff --git a/train_dreambooth.py b/train_dreambooth.py index 1b8a3d2..7a33bca 100644 --- a/train_dreambooth.py +++ b/train_dreambooth.py @@ -110,7 +110,7 @@ def parse_args(): parser.add_argument( "--tag_dropout", type=float, - default=0.1, + default=0, help="Tag dropout probability.", ) parser.add_argument( @@ -130,6 +130,11 @@ def parse_args(): default="auto", help='Vector shuffling algorithm. Choose between ["all", "trailing", "leading", "between", "auto", "off"]', ) + parser.add_argument( + "--guidance_scale", + type=float, + default=0, + ) parser.add_argument( "--num_class_images", type=int, @@ -178,7 +183,7 @@ def parse_args(): parser.add_argument( "--offset_noise_strength", type=float, - default=0.15, + default=0, help="Perlin offset noise strength.", ) parser.add_argument( @@ -557,8 +562,8 @@ def main(): vae=vae, noise_scheduler=noise_scheduler, dtype=weight_dtype, - with_prior_preservation=args.num_class_images != 0, - prior_loss_weight=args.prior_loss_weight, + guidance_scale=args.guidance_scale, + prior_loss_weight=args.prior_loss_weight if args.num_class_images != 0 else 0, no_val=args.valid_set_size == 0, ) @@ -570,6 +575,7 @@ def main(): batch_size=args.train_batch_size, tokenizer=tokenizer, class_subdir=args.class_image_dir, + with_guidance=args.guidance_scale != 0, num_class_images=args.num_class_images, size=args.resolution, num_buckets=args.num_buckets, diff --git a/train_lora.py b/train_lora.py index b16a99b..684d0cc 100644 --- a/train_lora.py +++ b/train_lora.py @@ -88,7 +88,7 @@ def parse_args(): parser.add_argument( "--num_buckets", type=int, - default=0, + default=2, help="Number of aspect ratio buckets in either direction.", ) parser.add_argument( @@ -111,7 +111,7 @@ def parse_args(): parser.add_argument( "--tag_dropout", type=float, - default=0.1, + default=0, help="Tag dropout probability.", ) parser.add_argument( @@ -119,6 +119,11 @@ def parse_args(): action="store_true", help="Shuffle tags.", ) + parser.add_argument( + "--guidance_scale", + type=float, + default=0, + ) parser.add_argument( "--num_class_images", type=int, @@ -167,7 +172,7 @@ def parse_args(): parser.add_argument( "--offset_noise_strength", type=float, - default=0.15, + default=0, help="Perlin offset noise strength.", ) parser.add_argument( @@ -589,8 +594,8 @@ def main(): vae=vae, noise_scheduler=noise_scheduler, dtype=weight_dtype, - with_prior_preservation=args.num_class_images != 0, - prior_loss_weight=args.prior_loss_weight, + guidance_scale=args.guidance_scale, + prior_loss_weight=args.prior_loss_weight if args.num_class_images != 0 else 0, no_val=args.valid_set_size == 0, ) @@ -602,6 +607,7 @@ def main(): batch_size=args.train_batch_size, tokenizer=tokenizer, class_subdir=args.class_image_dir, + with_guidance=args.guidance_scale != 0, num_class_images=args.num_class_images, size=args.resolution, num_buckets=args.num_buckets, diff --git a/train_ti.py b/train_ti.py index bbc5524..83ad46d 100644 --- a/train_ti.py +++ b/train_ti.py @@ -90,6 +90,11 @@ def parse_args(): "--sequential", action="store_true", ) + parser.add_argument( + "--guidance_scale", + type=float, + default=0, + ) parser.add_argument( "--num_class_images", type=int, @@ -167,7 +172,7 @@ def parse_args(): parser.add_argument( "--tag_dropout", type=float, - default=0.1, + default=0, help="Tag dropout probability.", ) parser.add_argument( @@ -190,7 +195,7 @@ def parse_args(): parser.add_argument( "--offset_noise_strength", type=float, - default=0.15, + default=0, help="Perlin offset noise strength.", ) parser.add_argument( @@ -651,8 +656,8 @@ def main(): noise_scheduler=noise_scheduler, dtype=weight_dtype, seed=args.seed, - with_prior_preservation=args.num_class_images != 0, - prior_loss_weight=args.prior_loss_weight, + guidance_scale=args.guidance_scale, + prior_loss_weight=args.prior_loss_weight if args.num_class_images != 0 else 0, no_val=args.valid_set_size == 0, strategy=textual_inversion_strategy, num_train_epochs=args.num_train_epochs, @@ -705,6 +710,7 @@ def main(): batch_size=args.train_batch_size, tokenizer=tokenizer, class_subdir=args.class_image_dir, + with_guidance=args.guidance_scale != 0, num_class_images=args.num_class_images, size=args.resolution, num_buckets=args.num_buckets, diff --git a/training/functional.py b/training/functional.py index 87bb339..d285366 100644 --- a/training/functional.py +++ b/training/functional.py @@ -274,7 +274,7 @@ def loss_step( noise_scheduler: SchedulerMixin, unet: UNet2DConditionModel, text_encoder: CLIPTextModel, - with_prior_preservation: bool, + guidance_scale: float, prior_loss_weight: float, seed: int, offset_noise_strength: float, @@ -283,13 +283,13 @@ def loss_step( eval: bool = False, min_snr_gamma: int = 5, ): - # Convert images to latent space - latents = vae.encode(batch["pixel_values"]).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - bsz = latents.shape[0] + images = batch["pixel_values"] + generator = torch.Generator(device=images.device).manual_seed(seed + step) if eval else None + bsz = images.shape[0] - generator = torch.Generator(device=latents.device).manual_seed(seed + step) if eval else None + # Convert images to latent space + latents = vae.encode(images).latent_dist.sample(generator=generator) + latents *= vae.config.scaling_factor # Sample noise that we'll add to the latents noise = torch.randn( @@ -301,13 +301,13 @@ def loss_step( ) if offset_noise_strength != 0: - noise += offset_noise_strength * perlin_noise( - latents.shape, - res=1, + offset_noise = torch.randn( + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype, device=latents.device, generator=generator - ) + ).expand(noise.shape) + noise += offset_noise_strength * offset_noise # Sample a random timestep for each image timesteps = torch.randint( @@ -343,7 +343,13 @@ def loss_step( else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - if with_prior_preservation: + if guidance_scale != 0: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred_uncond, model_pred_text = torch.chunk(model_pred, 2, dim=0) + model_pred = model_pred_uncond + guidance_scale * (model_pred_text - model_pred_uncond) + + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + elif prior_loss_weight != 0: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) target, target_prior = torch.chunk(target, 2, dim=0) @@ -607,9 +613,9 @@ def train( checkpoint_frequency: int = 50, milestone_checkpoints: bool = True, global_step_offset: int = 0, - with_prior_preservation: bool = False, + guidance_scale: float = 0.0, prior_loss_weight: float = 1.0, - offset_noise_strength: float = 0.1, + offset_noise_strength: float = 0.15, **kwargs, ): text_encoder, unet, optimizer, train_dataloader, val_dataloader, lr_scheduler, extra = strategy.prepare( @@ -638,7 +644,7 @@ def train( noise_scheduler, unet, text_encoder, - with_prior_preservation, + guidance_scale, prior_loss_weight, seed, offset_noise_strength, -- cgit v1.2.3-70-g09d2