diff options
-rw-r--r-- | training/functional.py | 25 |
1 file changed, 12 insertions, 13 deletions
diff --git a/training/functional.py b/training/functional.py index 15b95ba..8dc2b9f 100644 --- a/training/functional.py +++ b/training/functional.py | |||
@@ -261,7 +261,8 @@ def loss_step( | |||
261 | seed: int, | 261 | seed: int, |
262 | step: int, | 262 | step: int, |
263 | batch: dict[str, Any], | 263 | batch: dict[str, Any], |
264 | eval: bool = False | 264 | eval: bool = False, |
265 | min_snr_gamma: int = 5 | ||
265 | ): | 266 | ): |
266 | # Convert images to latent space | 267 | # Convert images to latent space |
267 | latents = vae.encode(batch["pixel_values"]).latent_dist.sample() | 268 | latents = vae.encode(batch["pixel_values"]).latent_dist.sample() |
@@ -307,23 +308,21 @@ def loss_step( | |||
307 | model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample | 308 | model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample |
308 | 309 | ||
309 | # Get the target for loss depending on the prediction type | 310 | # Get the target for loss depending on the prediction type |
311 | alpha_t = noise_scheduler.alphas_cumprod[timesteps].float() | ||
312 | snr = alpha_t / (1 - alpha_t) | ||
313 | min_snr = snr.clamp(max=min_snr_gamma) | ||
314 | |||
310 | if noise_scheduler.config.prediction_type == "epsilon": | 315 | if noise_scheduler.config.prediction_type == "epsilon": |
311 | target = noise | 316 | target = noise |
312 | 317 | loss_weight = min_snr / snr | |
313 | snr_weights = 1 | ||
314 | elif noise_scheduler.config.prediction_type == "v_prediction": | 318 | elif noise_scheduler.config.prediction_type == "v_prediction": |
315 | target = noise_scheduler.get_velocity(latents, noise, timesteps) | 319 | target = noise_scheduler.get_velocity(latents, noise, timesteps) |
316 | 320 | loss_weight = min_snr / (snr + 1) | |
317 | p2_gamma = 1 | ||
318 | p2_k = 1 | ||
319 | |||
320 | alpha_t = noise_scheduler.alphas_cumprod[timesteps].float() | ||
321 | snr = 1.0 / (1 - alpha_t) - 1 | ||
322 | snr_weights = 1 / (p2_k + snr) ** p2_gamma | ||
323 | snr_weights = snr_weights[..., None, None, None] | ||
324 | else: | 321 | else: |
325 | raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") | 322 | raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") |
326 | 323 | ||
324 | loss_weight = loss_weight[..., None, None, None] | ||
325 | |||
327 | if with_prior_preservation: | 326 | if with_prior_preservation: |
328 | # Chunk the noise and model_pred into two parts and compute the loss on each part separately. | 327 | # Chunk the noise and model_pred into two parts and compute the loss on each part separately. |
329 | model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) | 328 | model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) |
@@ -340,7 +339,7 @@ def loss_step( | |||
340 | else: | 339 | else: |
341 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") | 340 | loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") |
342 | 341 | ||
343 | loss = (snr_weights * loss).mean([1, 2, 3]).mean() | 342 | loss = (loss_weight * loss).mean([1, 2, 3]).mean() |
344 | acc = (model_pred == target).float().mean() | 343 | acc = (model_pred == target).float().mean() |
345 | 344 | ||
346 | return loss, acc, bsz | 345 | return loss, acc, bsz |
@@ -413,7 +412,7 @@ def train_loop( | |||
413 | try: | 412 | try: |
414 | for epoch in range(num_epochs): | 413 | for epoch in range(num_epochs): |
415 | if accelerator.is_main_process: | 414 | if accelerator.is_main_process: |
416 | if epoch % sample_frequency == 0: | 415 | if epoch % sample_frequency == 0 and epoch != 0: |
417 | local_progress_bar.clear() | 416 | local_progress_bar.clear() |
418 | global_progress_bar.clear() | 417 | global_progress_bar.clear() |
419 | 418 | ||