From 347ad308f8223d966793f0421c72432f7e912377 Mon Sep 17 00:00:00 2001
From: Volpeon
Date: Wed, 8 Feb 2023 11:38:56 +0100
Subject: Integrate Self-Attention-Guided (SAG) Stable Diffusion in my custom pipeline

---
 .../stable_diffusion/vlpn_stable_diffusion.py      | 169 ++++++++++++++++++++-
 1 file changed, 162 insertions(+), 7 deletions(-)

diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index dab7878..66566b0 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -1,9 +1,11 @@
 import inspect
 import warnings
+import math
 from typing import List, Dict, Any, Optional, Union, Callable

 import numpy as np
 import torch
+import torchvision.transforms as T
 import PIL

 from diffusers.configuration_utils import FrozenDict
@@ -37,6 +39,35 @@ def preprocess(image):
     return 2.0 * image - 1.0


+class CrossAttnStoreProcessor:
+    def __init__(self):
+        self.attention_probs = None
+
+    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        query = attn.to_q(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        self.attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(self.attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
 class VlpnStableDiffusion(DiffusionPipeline):
     def __init__(
         self,
@@ -233,9 +264,9 @@ class VlpnStableDiffusion(DiffusionPipeline):
         else:
             attention_mask = None

-        text_embeddings = get_extended_embeddings(self.text_encoder, text_input_ids, attention_mask)
+        prompt_embeds = get_extended_embeddings(self.text_encoder, text_input_ids, attention_mask)

-        return text_embeddings
+        return prompt_embeds

     def get_timesteps(self, latents_are_image, num_inference_steps, strength, device):
         if latents_are_image:
@@ -330,6 +361,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
         width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
+        sag_scale: float = 0.75,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None,
@@ -403,10 +435,11 @@ class VlpnStableDiffusion(DiffusionPipeline):
         batch_size = len(prompt)
         device = self.execution_device
         do_classifier_free_guidance = guidance_scale > 1.0
+        do_self_attention_guidance = sag_scale > 0.0
         latents_are_image = isinstance(image, PIL.Image.Image)

         # 3. Encode input prompt
-        text_embeddings = self.encode_prompt(
+        prompt_embeds = self.encode_prompt(
             prompt,
             negative_prompt,
             num_images_per_prompt,
@@ -427,7 +460,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 image,
                 latent_timestep,
                 batch_size * num_images_per_prompt,
-                text_embeddings.dtype,
+                prompt_embeds.dtype,
                 device,
                 generator
             )
@@ -437,7 +470,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 num_channels_latents,
                 height,
                 width,
-                text_embeddings.dtype,
+                prompt_embeds.dtype,
                 device,
                 generator,
                 image,
@@ -446,7 +479,11 @@ class VlpnStableDiffusion(DiffusionPipeline):
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 7. Denoising loop
+        # 7. Denoising loo
+        if do_self_attention_guidance:
+            store_processor = CrossAttnStoreProcessor()
+            self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor
+
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -458,7 +495,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 noise_pred = self.unet(
                     latent_model_input,
                     t,
-                    encoder_hidden_states=text_embeddings,
+                    encoder_hidden_states=prompt_embeds,
                     cross_attention_kwargs=cross_attention_kwargs,
                 ).sample

@@ -467,6 +504,36 @@ class VlpnStableDiffusion(DiffusionPipeline):
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

+                if do_self_attention_guidance:
+                    # classifier-free guidance produces two chunks of attention map
+                    # and we only use unconditional one according to equation (24)
+                    # in https://arxiv.org/pdf/2210.00939.pdf
+                    if do_classifier_free_guidance:
+                        # DDIM-like prediction of x0
+                        pred_x0 = self.pred_x0_from_eps(latents, noise_pred_uncond, t)
+                        # get the stored attention maps
+                        uncond_attn, cond_attn = store_processor.attention_probs.chunk(2)
+                        # self-attention-based degrading of latents
+                        degraded_latents = self.sag_masking(
+                            pred_x0, uncond_attn, t, self.pred_eps_from_noise(latents, noise_pred_uncond, t)
+                        )
+                        uncond_emb, _ = prompt_embeds.chunk(2)
+                        # forward and give guidance
+                        degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample
+                        noise_pred += sag_scale * (noise_pred_uncond - degraded_pred)
+                    else:
+                        # DDIM-like prediction of x0
+                        pred_x0 = self.pred_x0_from_eps(latents, noise_pred, t)
+                        # get the stored attention maps
+                        cond_attn = store_processor.attention_probs
+                        # self-attention-based degrading of latents
+                        degraded_latents = self.sag_masking(
+                            pred_x0, cond_attn, t, self.pred_eps_from_noise(latents, noise_pred, t)
+                        )
+                        # forward and give guidance
+                        degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample
+                        noise_pred += sag_scale * (noise_pred - degraded_pred)
+
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

@@ -490,3 +557,91 @@ class VlpnStableDiffusion(DiffusionPipeline):
             return (image, has_nsfw_concept)

         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    # Self-Attention-Guided (SAG) Stable Diffusion
+
+    def sag_masking(self, original_latents, attn_map, t, eps):
+        # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf
+        bh, hw1, hw2 = attn_map.shape
+        b, latent_channel, latent_h, latent_w = original_latents.shape
+        h = self.unet.attention_head_dim
+        if isinstance(h, list):
+            h = h[-1]
+        map_size = math.isqrt(hw1)
+
+        # Produce attention mask
+        attn_map = attn_map.reshape(b, h, hw1, hw2)
+        attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0
+        attn_mask = (
+            attn_mask.reshape(b, map_size, map_size).unsqueeze(1).repeat(1, latent_channel, 1, 1).type(attn_map.dtype)
+        )
+        attn_mask = torch.nn.functional.interpolate(attn_mask, (latent_h, latent_w))
+
+        # Blur according to the self-attention mask
+        transform = T.GaussianBlur(kernel_size=9, sigma=1.0)
+        degraded_latents = transform(original_latents)
+        degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask)
+
+        # Noise it again to match the noise level
+        degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t)
+
+        return degraded_latents
+
+    # Modified from diffusers.schedulers.scheduling_ddim.DDIMScheduler.step
+    def pred_x0_from_eps(self, sample, model_output, timestep):
+        # 1. get previous step value (=t-1)
+        # prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+        # alpha_prod_t_prev = (
+        #     self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
+        # )
+
+        beta_prod_t = 1 - alpha_prod_t
+        # 3. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        if self.scheduler.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.scheduler.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.scheduler.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # predict V
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
+                " or `v_prediction`"
+            )
+        # # 4. Clip "predicted x_0"
+        # if self.scheduler.config.clip_sample:
+        #     pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+        return pred_original_sample
+
+    def pred_eps_from_noise(self, sample, model_output, timestep):
+        # 1. get previous step value (=t-1)
+        # prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+
+        # 2. compute alphas, betas
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+        # alpha_prod_t_prev = (
+        #     self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
+        # )
+
+        beta_prod_t = 1 - alpha_prod_t
+        # 3. compute predicted eps from model output
+        if self.scheduler.config.prediction_type == "epsilon":
+            pred_eps = model_output
+        elif self.scheduler.config.prediction_type == "sample":
+            pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5)
+        elif self.scheduler.config.prediction_type == "v_prediction":
+            pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`,"
+                " or `v_prediction`"
+            )
+
+        return pred_eps
-- 
cgit v1.2.3-54-g00ecf
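
For reference, a minimal usage sketch of the new sag_scale parameter follows. It is not part of the commit: the prompt strings, the CUDA generator, the variable name `pipe`, and the assumption that a VlpnStableDiffusion instance has already been assembled from the usual Stable Diffusion components are illustrative only; only the parameter names visible in the diff above (guidance_scale, sag_scale, num_inference_steps, negative_prompt, generator) are taken from the patch.

import torch

# Assumption: `pipe` is an already-constructed VlpnStableDiffusion instance;
# how it is built is outside the scope of this patch.
generator = torch.Generator(device="cuda").manual_seed(0)

result = pipe(
    prompt="a photo of an astronaut riding a horse",
    negative_prompt="blurry, low quality",
    num_inference_steps=50,
    guidance_scale=7.5,  # > 1.0 keeps classifier-free guidance active, as before
    sag_scale=0.75,      # > 0.0 enables the self-attention guidance branch added above
    generator=generator,
)
image = result.images[0]  # assuming the default PIL output of StableDiffusionPipelineOutput

# Passing sag_scale=0.0 skips the SAG branch entirely and reproduces the
# pipeline's previous behaviour.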