author    Volpeon <git@volpeon.ink>  2023-03-03 22:09:24 +0100
committer Volpeon <git@volpeon.ink>  2023-03-03 22:09:24 +0100
commit    220806dbd21da3ba83c14096225c31824dfe81df (patch)
tree      a201272876bfe894f9d504d1582ac022add4b205 /pipelines/stable_diffusion/vlpn_stable_diffusion.py
parent    Implemented different noise offset (diff)
Removed offset noise from training, added init offset to pipeline
Diffstat (limited to 'pipelines/stable_diffusion/vlpn_stable_diffusion.py')
-rw-r--r--  pipelines/stable_diffusion/vlpn_stable_diffusion.py | 95
1 file changed, 39 insertions(+), 56 deletions(-)
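Note: the "init offset" added in this commit is implemented by a new prepare_image() helper (first hunk below). It draws one Gaussian scalar per sample and broadcasts it to a solid-color RGB image, which the pipeline then encodes and noises like a regular init image. A minimal sketch of that behaviour outside the pipeline, with hypothetical sizes:

    import torch

    batch_size, width, height = 4, 512, 512
    # one random scalar per sample, expanded to a uniform-color image
    offset_image = torch.randn((batch_size, 1, 1, 1)).expand(batch_size, 3, width, height)
    assert (offset_image[0] == offset_image[0, 0, 0, 0]).all()  # constant per sample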
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index cb09fe1..c4f7401 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -293,53 +293,39 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
         return prompt_embeds
 
-    def get_timesteps(self, latents_are_image, num_inference_steps, strength, device):
-        if latents_are_image:
-            # get the original timestep using init_timestep
-            offset = self.scheduler.config.get("steps_offset", 0)
-            init_timestep = int(num_inference_steps * strength) + offset
-            init_timestep = min(init_timestep, num_inference_steps)
-
-            t_start = max(num_inference_steps - init_timestep + offset, 0)
-            timesteps = self.scheduler.timesteps[t_start:]
-        else:
-            timesteps = self.scheduler.timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
 
         timesteps = timesteps.to(device)
 
         return timesteps
 
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        return latents
-
-    def prepare_latents_from_image(self, init_image, timestep, batch_size, dtype, device, generator=None):
+    def prepare_image(self, batch_size, width, height, dtype, device, generator=None):
+        return torch.randn(
+            (batch_size, 1, 1, 1),
+            dtype=dtype,
+            device=device,
+            generator=generator
+        ).expand(batch_size, 3, width, height)
+
+    def prepare_latents(self, init_image, timestep, batch_size, dtype, device, generator=None):
         init_image = init_image.to(device=device, dtype=dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
+        init_latents = self.vae.encode(init_image).latent_dist.sample(generator=generator)
+        init_latents = self.vae.config.scaling_factor * init_latents
 
-        if batch_size > init_latents.shape[0]:
+        if batch_size % init_latents.shape[0] != 0:
             raise ValueError(
                 f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
             )
         else:
-            init_latents = torch.cat([init_latents] * batch_size, dim=0)
+            batch_multiplier = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * batch_multiplier, dim=0)
 
         # add noise to latents using the timesteps
         noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype)
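For reference, a worked example of the get_timesteps() arithmetic above, assuming a scheduler whose config has steps_offset = 0:

    num_inference_steps, strength, offset = 50, 0.8, 0
    init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)  # 40
    t_start = max(num_inference_steps - init_timestep + offset, 0)                          # 10
    # the denoising loop then runs over scheduler.timesteps[10:], i.e. the last
    # 40 of 50 steps; with the new default strength=1.0, t_start is 0 and the
    # full 50-step schedule is used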
@@ -368,7 +354,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
         return extra_step_kwargs
 
     def decode_latents(self, latents):
-        latents = 1 / 0.18215 * latents
+        latents = 1 / self.vae.config.scaling_factor * latents
         image = self.vae.decode(latents).sample
         image = (image / 2 + 0.5).clamp(0, 1)
         # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
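Reading the latent scale from self.vae.config.scaling_factor instead of hard-coding 0.18215 keeps encode (multiply) and decode (divide) consistent for VAEs trained with a different scaling. A rough round-trip sketch, assuming a stock diffusers AutoencoderKL checkpoint:

    import torch
    from diffusers import AutoencoderKL

    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
    image = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed init image
    latents = vae.config.scaling_factor * vae.encode(image).latent_dist.sample()
    decoded = vae.decode(latents / vae.config.scaling_factor).sample
    print(vae.config.scaling_factor)  # 0.18215 for the SD 1.x/2.x VAEs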
@@ -381,7 +367,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
         prompt: Union[str, List[str], List[int], List[List[int]]],
         negative_prompt: Optional[Union[str, List[str], List[int], List[List[int]]]] = None,
         num_images_per_prompt: int = 1,
-        strength: float = 0.8,
+        strength: float = 1.0,
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 50,
@@ -461,7 +447,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
461 device = self.execution_device 447 device = self.execution_device
462 do_classifier_free_guidance = guidance_scale > 1.0 448 do_classifier_free_guidance = guidance_scale > 1.0
463 do_self_attention_guidance = sag_scale > 0.0 449 do_self_attention_guidance = sag_scale > 0.0
464 latents_are_image = isinstance(image, PIL.Image.Image)
465 450
466 # 3. Encode input prompt 451 # 3. Encode input prompt
467 prompt_embeds = self.encode_prompt( 452 prompt_embeds = self.encode_prompt(
@@ -474,33 +459,31 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.get_timesteps(latents_are_image, num_inference_steps, strength, device)
+        timesteps = self.get_timesteps(num_inference_steps, strength, device)
 
         # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
-        if latents_are_image:
+        if isinstance(image, PIL.Image.Image):
             image = preprocess(image)
-            latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-            latents = self.prepare_latents_from_image(
-                image,
-                latent_timestep,
+        elif image is None:
+            image = self.prepare_image(
                 batch_size * num_images_per_prompt,
-                prompt_embeds.dtype,
-                device,
-                generator
-            )
-        else:
-            latents = self.prepare_latents(
-                batch_size * num_images_per_prompt,
-                num_channels_latents,
-                height,
                 width,
+                height,
                 prompt_embeds.dtype,
                 device,
-                generator,
-                image,
+                generator
             )
 
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+        latents = self.prepare_latents(
+            image,
+            latent_timestep,
+            batch_size * num_images_per_prompt,
+            prompt_embeds.dtype,
+            device,
+            generator
+        )
+
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
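Taken together, a hypothetical usage sketch of the new code path (model path and prompt are placeholders; loading this custom pipeline via from_pretrained is assumed to work as for any DiffusionPipeline subclass):

    import torch

    pipe = VlpnStableDiffusion.from_pretrained("path/to/model", torch_dtype=torch.float16).to("cuda")

    # omitting image (image=None) now triggers prepare_image(): generation
    # starts from a per-sample solid-color init image instead of pure latent
    # noise, and the new default strength=1.0 denoises over essentially the
    # full schedule
    output = pipe(prompt="a photo of a fox", num_inference_steps=50)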