Diffstat (limited to 'training')

 -rw-r--r--  training/functional.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)
diff --git a/training/functional.py b/training/functional.py
index 87bb339..d285366 100644
--- a/training/functional.py
+++ b/training/functional.py
@@ -274,7 +274,7 @@ def loss_step(
     noise_scheduler: SchedulerMixin,
     unet: UNet2DConditionModel,
     text_encoder: CLIPTextModel,
-    with_prior_preservation: bool,
+    guidance_scale: float,
     prior_loss_weight: float,
     seed: int,
     offset_noise_strength: float,
@@ -283,13 +283,13 @@ def loss_step(
     eval: bool = False,
     min_snr_gamma: int = 5,
 ):
-    # Convert images to latent space
-    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
-    latents = latents * vae.config.scaling_factor
-
-    bsz = latents.shape[0]
+    images = batch["pixel_values"]
+    generator = torch.Generator(device=images.device).manual_seed(seed + step) if eval else None
+    bsz = images.shape[0]
 
-    generator = torch.Generator(device=latents.device).manual_seed(seed + step) if eval else None
+    # Convert images to latent space
+    latents = vae.encode(images).latent_dist.sample(generator=generator)
+    latents *= vae.config.scaling_factor
 
     # Sample noise that we'll add to the latents
     noise = torch.randn(
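Note on the reordering in this hunk: the torch.Generator is now created before the VAE encode, so during eval the posterior sample itself is seeded and repeatable. A minimal standalone sketch of the idea, assuming a diffusers-style AutoencoderKL whose latent_dist.sample accepts a generator:

    import torch

    def encode_deterministic(vae, images, seed, step):
        # Seed one generator per eval step so the same (seed, step) pair
        # always draws the same latent sample from the VAE posterior.
        generator = torch.Generator(device=images.device).manual_seed(seed + step)
        latents = vae.encode(images).latent_dist.sample(generator=generator)
        return latents * vae.config.scaling_factor

During training (eval is False) the generator stays None and the draw remains stochastic, matching the patched code.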
@@ -301,13 +301,13 @@
     )
 
     if offset_noise_strength != 0:
-        noise += offset_noise_strength * perlin_noise(
-            latents.shape,
-            res=1,
-            dtype=latents.dtype,
-            device=latents.device,
-            generator=generator
-        )
+        offset_noise = torch.randn(
+            (latents.shape[0], latents.shape[1], 1, 1),
+            dtype=latents.dtype,
+            device=latents.device,
+            generator=generator
+        ).expand(noise.shape)
+        noise += offset_noise_strength * offset_noise
 
     # Sample a random timestep for each image
     timesteps = torch.randint(
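This hunk replaces Perlin noise with the simpler offset-noise trick: a single Gaussian draw per (sample, channel), broadcast across the spatial dimensions, which shifts whole channels brighter or darker and helps the model reproduce very dark or very bright images. A self-contained sketch of the same computation (the helper name is illustrative):

    import torch

    def add_offset_noise(noise, latents, strength, generator=None):
        # One scalar per (batch, channel), expanded over H and W, so each
        # channel of each sample is offset as a whole.
        offset = torch.randn(
            (latents.shape[0], latents.shape[1], 1, 1),
            dtype=latents.dtype,
            device=latents.device,
            generator=generator,
        ).expand(noise.shape)
        return noise + strength * offset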
@@ -343,7 +343,13 @@
     else:
         raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
 
-    if with_prior_preservation:
+    if guidance_scale != 0:
+        # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+        model_pred_uncond, model_pred_text = torch.chunk(model_pred, 2, dim=0)
+        model_pred = model_pred_uncond + guidance_scale * (model_pred_text - model_pred_uncond)
+
+        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+    elif prior_loss_weight != 0:
         # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
         model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
         target, target_prior = torch.chunk(target, 2, dim=0)
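The new guidance_scale branch applies the classifier-free guidance formula to the model prediction before computing the loss; it assumes the batch stacks the unconditional half before the text-conditioned half along dim 0. A sketch of just that combination (the helper name is hypothetical, and target is assumed to already have the combined prediction's shape):

    import torch
    import torch.nn.functional as F

    def cfg_loss(model_pred, target, guidance_scale):
        # Standard CFG combination: move the conditioned prediction away
        # from the unconditional one by a factor of guidance_scale.
        pred_uncond, pred_text = torch.chunk(model_pred, 2, dim=0)
        guided = pred_uncond + guidance_scale * (pred_text - pred_uncond)
        # Assumption: target matches guided's (halved) batch size.
        return F.mse_loss(guided.float(), target.float(), reduction="none")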
@@ -607,9 +613,9 @@ def train(
     checkpoint_frequency: int = 50,
     milestone_checkpoints: bool = True,
     global_step_offset: int = 0,
-    with_prior_preservation: bool = False,
+    guidance_scale: float = 0.0,
     prior_loss_weight: float = 1.0,
-    offset_noise_strength: float = 0.1,
+    offset_noise_strength: float = 0.15,
     **kwargs,
 ):
     text_encoder, unet, optimizer, train_dataloader, val_dataloader, lr_scheduler, extra = strategy.prepare(
@@ -638,7 +644,7 @@
         noise_scheduler,
         unet,
         text_encoder,
-        with_prior_preservation,
+        guidance_scale,
         prior_loss_weight,
         seed,
         offset_noise_strength,
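For callers of train, the migration is mechanical: the with_prior_preservation boolean becomes two float knobs, and the loss branches are mutually exclusive (a non-zero guidance_scale takes precedence; otherwise a non-zero prior_loss_weight selects the prior-preservation path). A hypothetical call site, with placeholder values and unrelated arguments elided:

    train(
        strategy=strategy,
        guidance_scale=0.0,          # 0.0 skips the CFG branch
        prior_loss_weight=1.0,       # only consulted when guidance_scale == 0
        offset_noise_strength=0.15,  # default raised from 0.1
        # ...remaining training arguments...
    )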