From 515f0f1fdc9a76bf63bd746c291dcfec7fc747fb Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Thu, 13 Oct 2022 21:11:53 +0200
Subject: Added support for Aesthetic Gradients

---
 .gitignore                                     |   4 +-
 aesthetic_gradient.py                          | 137 +++++++++++++++++++++
 data/csv.py                                    |   2 +-
 dreambooth.py                                  |  10 --
 dreambooth_plus.py                             |  16 +--
 infer.py                                       |  75 +++++++----
 .../stable_diffusion/vlpn_stable_diffusion.py  |  52 +++++++-
 textual_inversion.py                           |  10 --
 8 files changed, 245 insertions(+), 61 deletions(-)
 create mode 100644 aesthetic_gradient.py

diff --git a/.gitignore b/.gitignore
index 6b9605f..d84b4dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,5 +161,7 @@ cython_debug/
 
 output/
 conf/
-embeddings/
+embeddings_ti/
+embeddings_ag/
 v1-inference.yaml*
+*.old
diff --git a/aesthetic_gradient.py b/aesthetic_gradient.py
new file mode 100644
index 0000000..5386d0f
--- /dev/null
+++ b/aesthetic_gradient.py
@@ -0,0 +1,137 @@
+import argparse
+import datetime
+import logging
+import json
+from pathlib import Path
+
+import torch
+import torch.utils.checkpoint
+from torchvision import transforms
+import pandas as pd
+
+from accelerate.logging import get_logger
+from PIL import Image
+from tqdm import tqdm
+from transformers import CLIPModel
+from slugify import slugify
+
+logger = get_logger(__name__)
+
+
+torch.backends.cuda.matmul.allow_tf32 = True
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Simple example of a training script."
+    )
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--train_data_file",
+        type=str,
+        default=None,
+        help="A directory."
+    )
+    parser.add_argument(
+        "--token",
+        type=str,
+        default=None,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=224,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output/aesthetic-gradient",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default=None,
+        help="Path to a JSON configuration file containing arguments for invoking this script. If resume_from is given, its resume.json takes priority over this."
+    )
+
+    args = parser.parse_args()
+    if args.config is not None:
+        with open(args.config, 'rt') as f:
+            args = parser.parse_args(
+                namespace=argparse.Namespace(**json.load(f)["args"]))
+
+    if args.train_data_file is None:
+        raise ValueError("You must specify --train_data_file")
+
+    if args.token is None:
+        raise ValueError("You must specify --token")
+
+    if args.output_dir is None:
+        raise ValueError("You must specify --output_dir")
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+    basepath = Path(args.output_dir)
+    basepath.mkdir(parents=True, exist_ok=True)
+    target = basepath.joinpath(f"{slugify(args.token)}-{now}.pt")
+
+    logging.basicConfig(filename=basepath.joinpath("log.txt"), level=logging.DEBUG)
+
+    data_file = Path(args.train_data_file)
+    if not data_file.is_file():
+        raise ValueError("data_file must be a file")
+    data_root = data_file.parent
+    metadata = pd.read_csv(data_file)
+    image_paths = [
+        data_root.joinpath(item.image)
+        for item in metadata.itertuples()
+        if "skip" not in item or item.skip != "x"
+    ]
+
+    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.LANCZOS),
+            transforms.RandomCrop(args.resolution),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    with torch.no_grad():
+        embs = []
+        for path in tqdm(image_paths):
+            image = Image.open(path)
+            if not image.mode == "RGB":
+                image = image.convert("RGB")
+            image = image_transforms(image).unsqueeze(0)
+            emb = model.get_image_features(image)
+            print(f">>>> {emb.shape}")
+            embs.append(emb)
+
+        embs = torch.cat(embs, dim=0).mean(dim=0, keepdim=True)
+
+        print(embs.shape)
+
+        torch.save(embs, target)
+
+
+if __name__ == "__main__":
+    main()
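For orientation: aesthetic_gradient.py reads the image list from the training CSV, encodes every image with OpenAI's CLIP ViT-L/14, and saves the mean image embedding as <slugified-token>-<timestamp>.pt in the output directory. A minimal sketch of inspecting such a file (the path below is illustrative, not taken from the patch):

    import torch

    emb = torch.load("embeddings_ag/my-style-2022-10-13T21-11-53.pt", map_location="cpu")
    print(emb.shape)                            # expected: torch.Size([1, 768]) for ViT-L/14 image features
    emb = emb / emb.norm(dim=-1, keepdim=True)  # the pipeline normalizes it the same way at inference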
diff --git a/data/csv.py b/data/csv.py
index 253ce9e..aad970c 100644
--- a/data/csv.py
+++ b/data/csv.py
@@ -23,7 +23,7 @@ class CSVDataModule(pl.LightningDataModule):
         tokenizer,
         instance_identifier,
         class_identifier=None,
-        class_subdir="db_cls",
+        class_subdir="cls",
         num_class_images=100,
         size=512,
         repeats=100,
diff --git a/dreambooth.py b/dreambooth.py
index 699313e..072142e 100644
--- a/dreambooth.py
+++ b/dreambooth.py
@@ -215,12 +215,6 @@ def parse_args():
             "and an Nvidia Ampere GPU."
         ),
     )
-    parser.add_argument(
-        "--local_rank",
-        type=int,
-        default=-1,
-        help="For distributed training: local_rank"
-    )
     parser.add_argument(
         "--sample_frequency",
         type=int,
@@ -287,10 +281,6 @@ def parse_args():
             args = parser.parse_args(
                 namespace=argparse.Namespace(**json.load(f)["args"]))
 
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
     if args.train_data_file is None:
         raise ValueError("You must specify --train_data_file")
 
diff --git a/dreambooth_plus.py b/dreambooth_plus.py
index 9e482b3..7996bc2 100644
--- a/dreambooth_plus.py
+++ b/dreambooth_plus.py
@@ -112,7 +112,7 @@ def parse_args():
     parser.add_argument(
         "--max_train_steps",
         type=int,
-        default=3000,
+        default=1600,
         help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
     )
     parser.add_argument(
@@ -129,13 +129,13 @@ def parse_args():
     parser.add_argument(
         "--learning_rate_unet",
         type=float,
-        default=1e-5,
+        default=5e-6,
         help="Initial learning rate (after the potential warmup period) to use.",
     )
     parser.add_argument(
         "--learning_rate_text",
         type=float,
-        default=1e-4,
+        default=5e-4,
         help="Initial learning rate (after the potential warmup period) to use.",
     )
     parser.add_argument(
@@ -221,12 +221,6 @@ def parse_args():
             "and an Nvidia Ampere GPU."
         ),
     )
-    parser.add_argument(
-        "--local_rank",
-        type=int,
-        default=-1,
-        help="For distributed training: local_rank"
-    )
     parser.add_argument(
         "--sample_frequency",
         type=int,
@@ -293,10 +287,6 @@ def parse_args():
             args = parser.parse_args(
                 namespace=argparse.Namespace(**json.load(f)["args"]))
 
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
     if args.train_data_file is None:
         raise ValueError("You must specify --train_data_file")
 
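The training scripts (dreambooth.py, dreambooth_plus.py, textual_inversion.py) drop their hand-rolled --local_rank argument and LOCAL_RANK environment handling. The commit does not state the motivation, but these scripts are driven by Hugging Face accelerate, which already owns the distributed setup; a sketch of reading the rank from the Accelerator object instead (an assumption about the intended replacement, not code from the patch):

    from accelerate import Accelerator

    accelerator = Accelerator()
    print(accelerator.local_process_index)  # local rank on this machine
    print(accelerator.process_index)        # global rank across all processes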
diff --git a/infer.py b/infer.py
index 63b16d8..650c119 100644
--- a/infer.py
+++ b/infer.py
@@ -23,7 +23,8 @@ default_args = {
     "model": None,
     "scheduler": "euler_a",
     "precision": "fp32",
-    "embeddings_dir": "embeddings",
+    "ti_embeddings_dir": "embeddings_ti",
+    "ag_embeddings_dir": "embeddings_ag",
     "output_dir": "output/inference",
     "config": None,
 }
@@ -73,7 +74,11 @@ def create_args_parser():
         choices=["fp32", "fp16", "bf16"],
     )
     parser.add_argument(
-        "--embeddings_dir",
+        "--ti_embeddings_dir",
+        type=str,
+    )
+    parser.add_argument(
+        "--ag_embeddings_dir",
         type=str,
     )
     parser.add_argument(
@@ -167,42 +172,63 @@ def save_args(basepath, args, extra={}):
         json.dump(info, f, indent=4)
 
 
-def load_embeddings(tokenizer, text_encoder, embeddings_dir):
+def load_embeddings_ti(tokenizer, text_encoder, embeddings_dir):
+    print(f"Loading Textual Inversion embeddings")
+
     embeddings_dir = Path(embeddings_dir)
     embeddings_dir.mkdir(parents=True, exist_ok=True)
 
     for file in embeddings_dir.iterdir():
-        placeholder_token = file.stem
+        if file.is_file():
+            placeholder_token = file.stem
 
-        num_added_tokens = tokenizer.add_tokens(placeholder_token)
-        if num_added_tokens == 0:
-            raise ValueError(
-                f"The tokenizer already contains the token {placeholder_token}. Please pass a different"
-                " `placeholder_token` that is not already in the tokenizer."
-            )
+            num_added_tokens = tokenizer.add_tokens(placeholder_token)
+            if num_added_tokens == 0:
+                raise ValueError(
+                    f"The tokenizer already contains the token {placeholder_token}. Please pass a different"
+                    " `placeholder_token` that is not already in the tokenizer."
+                )
 
     text_encoder.resize_token_embeddings(len(tokenizer))
 
     token_embeds = text_encoder.get_input_embeddings().weight.data
 
     for file in embeddings_dir.iterdir():
-        placeholder_token = file.stem
-        placeholder_token_id = tokenizer.convert_tokens_to_ids(placeholder_token)
+        if file.is_file():
+            placeholder_token = file.stem
+            placeholder_token_id = tokenizer.convert_tokens_to_ids(placeholder_token)
+
+            data = torch.load(file, map_location="cpu")
+
+            assert len(data.keys()) == 1, 'embedding file has multiple terms in it'
+
+            emb = next(iter(data.values()))
+            if len(emb.shape) == 1:
+                emb = emb.unsqueeze(0)
 
-        data = torch.load(file, map_location="cpu")
+            token_embeds[placeholder_token_id] = emb
 
-        assert len(data.keys()) == 1, 'embedding file has multiple terms in it'
+            print(f"Loaded {placeholder_token}")
 
-        emb = next(iter(data.values()))
-        if len(emb.shape) == 1:
-            emb = emb.unsqueeze(0)
 
-        token_embeds[placeholder_token_id] = emb
+def load_embeddings_ag(pipeline, embeddings_dir):
+    print(f"Loading Aesthetic Gradient embeddings")
 
-        print(f"Loaded embedding: {placeholder_token}")
+    embeddings_dir = Path(embeddings_dir)
+    embeddings_dir.mkdir(parents=True, exist_ok=True)
+
+    for file in embeddings_dir.iterdir():
+        if file.is_file():
+            placeholder_token = file.stem
 
+            data = torch.load(file, map_location="cpu")
 
-def create_pipeline(model, scheduler, embeddings_dir, dtype):
+            pipeline.add_aesthetic_gradient_embedding(placeholder_token, data)
+
+            print(f"Loaded {placeholder_token}")
+
+
+def create_pipeline(model, scheduler, ti_embeddings_dir, ag_embeddings_dir, dtype):
     print("Loading Stable Diffusion pipeline...")
 
     tokenizer = CLIPTokenizer.from_pretrained(model, subfolder='tokenizer', torch_dtype=dtype)
@@ -210,7 +236,7 @@ def create_pipeline(model, scheduler, embeddings_dir, dtype):
     vae = AutoencoderKL.from_pretrained(model, subfolder='vae', torch_dtype=dtype)
     unet = UNet2DConditionModel.from_pretrained(model, subfolder='unet', torch_dtype=dtype)
 
-    load_embeddings(tokenizer, text_encoder, embeddings_dir)
+    load_embeddings_ti(tokenizer, text_encoder, ti_embeddings_dir)
 
     if scheduler == "plms":
         scheduler = PNDMScheduler(
@@ -236,10 +262,13 @@ def create_pipeline(model, scheduler, embeddings_dir, dtype):
         tokenizer=tokenizer,
         scheduler=scheduler,
     )
+    pipeline.aesthetic_gradient_iters = 30
    pipeline.to("cuda")
 
     print("Pipeline loaded.")
 
+    load_embeddings_ag(pipeline, ag_embeddings_dir)
+
     return pipeline
 
 
@@ -259,7 +288,7 @@ def generate(output_dir, pipeline, args):
     else:
         init_image = None
 
-    with torch.autocast("cuda"), torch.inference_mode():
+    with torch.autocast("cuda"):
         for i in range(args.batch_num):
             pipeline.set_progress_bar_config(
                 desc=f"Batch {i + 1} of {args.batch_num}",
@@ -337,7 +366,7 @@ def main():
     output_dir = Path(args.output_dir)
     dtype = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}[args.precision]
 
-    pipeline = create_pipeline(args.model, args.scheduler, args.embeddings_dir, dtype)
+    pipeline = create_pipeline(args.model, args.scheduler, args.ti_embeddings_dir, args.ag_embeddings_dir, dtype)
     cmd_parser = create_cmd_parser()
     cmd_prompt = CmdParse(output_dir, pipeline, cmd_parser)
     cmd_prompt.cmdloop()
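With this change infer.py loads two kinds of embeddings from separate directories: Textual Inversion tensors from --ti_embeddings_dir, where each file stem becomes a new tokenizer token, and Aesthetic Gradient tensors from --ag_embeddings_dir, where each file stem becomes a keyword that triggers the gradient step when it appears in the prompt. A usage sketch of the new create_pipeline signature (model path and directories are placeholders; only the argument order comes from the patch, and it assumes infer.py is importable and a CUDA device is available):

    import torch
    from infer import create_pipeline

    pipeline = create_pipeline(
        "path/to/stable-diffusion-model",  # placeholder checkpoint path or hub id
        "euler_a",                         # scheduler
        "embeddings_ti",                   # ti_embeddings_dir
        "embeddings_ag",                   # ag_embeddings_dir
        torch.float32,
    )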
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index 8927a78..1a84c8d 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -4,13 +4,14 @@ from typing import List, Optional, Union
 
 import numpy as np
 import torch
+import torch.optim as optim
 import PIL
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers import AutoencoderKL, DiffusionPipeline, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import logging
-from transformers import CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextModel, CLIPTokenizer, CLIPModel
 from schedulers.scheduling_euler_a import EulerAScheduler
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -50,6 +51,10 @@ class VlpnStableDiffusion(DiffusionPipeline):
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        self.aesthetic_gradient_embeddings = {}
+        self.aesthetic_gradient_lr = 1e-4
+        self.aesthetic_gradient_iters = 10
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -58,6 +63,47 @@ class VlpnStableDiffusion(DiffusionPipeline):
             scheduler=scheduler,
         )
 
+    def add_aesthetic_gradient_embedding(self, keyword: str, tensor: torch.IntTensor):
+        self.aesthetic_gradient_embeddings[keyword] = tensor
+
+    def get_text_embeddings(self, prompt, text_input_ids):
+        prompt = " ".join(prompt)
+
+        embeddings = [
+            embedding
+            for key, embedding in self.aesthetic_gradient_embeddings.items()
+            if key in prompt
+        ]
+
+        if len(embeddings) != 0:
+            with torch.enable_grad():
+                full_clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+                full_clip_model.to(self.device)
+                full_clip_model.text_model.train()
+
+                optimizer = optim.Adam(full_clip_model.text_model.parameters(), lr=self.aesthetic_gradient_lr)
+
+                for embs in embeddings:
+                    embs = embs.clone().detach().to(self.device)
+                    embs /= embs.norm(dim=-1, keepdim=True)
+
+                    for i in range(self.aesthetic_gradient_iters):
+                        text_embs = full_clip_model.get_text_features(text_input_ids)
+                        text_embs /= text_embs.norm(dim=-1, keepdim=True)
+                        sim = text_embs @ embs.T
+                        loss = -sim
+                        loss = loss.mean()
+
+                        loss.backward()
+                        optimizer.step()
+                        optimizer.zero_grad()
+
+                full_clip_model.text_model.eval()
+
+            return full_clip_model.text_model(text_input_ids)[0]
+        else:
+            return self.text_encoder(text_input_ids)[0]
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -195,7 +241,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             )
             print(f"Too many tokens: {removed_text}")
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.get_text_embeddings(prompt, text_input_ids.to(self.device))
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -207,7 +253,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
             uncond_input = self.tokenizer(
                 negative_prompt, padding="max_length", max_length=max_length, return_tensors="pt"
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.get_text_embeddings(negative_prompt, uncond_input.input_ids.to(self.device))
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
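The pipeline change is the core of the feature: get_text_embeddings checks whether any registered keyword occurs in the prompt, and if so it loads a full CLIP model, briefly fine-tunes its text encoder so that the prompt's text features point toward the stored (normalized) image embedding, and uses that personalized text encoder for conditioning; otherwise the regular text encoder is used. A self-contained sketch of the same optimization step, assuming a mean image embedding saved by aesthetic_gradient.py (file name and prompt below are placeholders):

    import torch
    import torch.optim as optim
    from transformers import CLIPModel, CLIPTokenizer

    clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

    target = torch.load("aesthetic.pt")                  # placeholder path; shape [1, 768]
    target = target / target.norm(dim=-1, keepdim=True)

    ids = tokenizer("a picture in my style", return_tensors="pt").input_ids
    optimizer = optim.Adam(clip.text_model.parameters(), lr=1e-4)

    clip.text_model.train()
    for _ in range(10):  # a handful of steps, like the pipeline's default aesthetic_gradient_iters
        text_feat = clip.get_text_features(ids)
        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
        loss = -(text_feat @ target.T).mean()            # maximize cosine similarity to the image embedding
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    clip.text_model.eval()

    conditioning = clip.text_model(ids)[0]               # per-token hidden states used as conditioning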
diff --git a/textual_inversion.py b/textual_inversion.py
index 181a318..9d2840d 100644
--- a/textual_inversion.py
+++ b/textual_inversion.py
@@ -192,12 +192,6 @@ def parse_args():
             "and an Nvidia Ampere GPU."
         ),
     )
-    parser.add_argument(
-        "--local_rank",
-        type=int,
-        default=-1,
-        help="For distributed training: local_rank"
-    )
     parser.add_argument(
         "--checkpoint_frequency",
         type=int,
@@ -280,10 +274,6 @@ def parse_args():
             args = parser.parse_args(
                 namespace=argparse.Namespace(**json.load(f)["args"]))
 
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
     if args.train_data_file is None:
         raise ValueError("You must specify --train_data_file")
 
-- 
cgit v1.2.3-54-g00ecf