 data/csv.py   |   2 +-
 dreambooth.py | 153 +++++++++++++++++++++++++++++----------------------------
 2 files changed, 83 insertions(+), 72 deletions(-)
diff --git a/data/csv.py b/data/csv.py
index 4c91ded..df15c5a 100644
--- a/data/csv.py
+++ b/data/csv.py
@@ -76,7 +76,7 @@ class CSVDataModule(pl.LightningDataModule):
 
     def prepare_data(self):
         metadata = pd.read_json(self.data_file)
-        metadata = [item for item in metadata.itertuples() if "skip" not in item or item.skip != True]
+        metadata = [item for item in metadata.itertuples() if not hasattr(item, "skip") or item.skip != True]
         num_images = len(metadata)
 
         valid_set_size = int(num_images * 0.2)
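Note on the data/csv.py fix: the rows yielded by DataFrame.itertuples() are namedtuples, and the `in` operator on a namedtuple tests membership among its values, not its field names. The old condition `"skip" not in item` was therefore effectively always true, so flagged rows were never filtered out; `hasattr(item, "skip")` tests for the field itself. A minimal sketch with hypothetical data, assuming pandas:

import pandas as pd

# Hypothetical metadata; the real code loads it with pd.read_json.
df = pd.DataFrame([{"image": "a.png", "skip": True}])
row = next(df.itertuples())

print("skip" in row)         # False: 'in' scans the tuple's values (0, "a.png", True)
print(hasattr(row, "skip"))  # True: the row exposes the column as an attribute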
diff --git a/dreambooth.py b/dreambooth.py
index 1539e81..72c56cd 100644
--- a/dreambooth.py
+++ b/dreambooth.py
@@ -810,93 +810,85 @@ def main():
     )
     global_progress_bar.set_description("Total progress")
 
-    def run_step(batch, train=False, class_images=False):
-        # Convert images to latent space
-        latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
-        latents = latents * 0.18215
-
-        # Sample noise that we'll add to the latents
-        noise = torch.randn_like(latents)
-        bsz = latents.shape[0]
-        # Sample a random timestep for each image
-        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-        timesteps = timesteps.long()
-
-        # Add noise to the latents according to the noise magnitude at each timestep
-        # (this is the forward diffusion process)
-        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
-
-        # Get the text embedding for conditioning
-        encoder_hidden_states = prompt_processor.get_embeddings(batch["input_ids"])
-
-        # Predict the noise residual
-        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-        if class_images:
-            # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
-            noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
-            noise, noise_prior = torch.chunk(noise, 2, dim=0)
-
-            # Compute instance loss
-            loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
-
-            # Compute prior loss
-            prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
-
-            # Add the prior loss to the instance loss.
-            loss = loss + args.prior_loss_weight * prior_loss
-        else:
-            loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
-
-        if train:
-            accelerator.backward(loss)
-
-            if args.initializer_token is not None:
-                # Keep the token embeddings fixed except the newly added
-                # embeddings for the concept, as we only want to optimize the concept embeddings
-                if accelerator.num_processes > 1:
-                    token_embeds = text_encoder.module.get_input_embeddings().weight
-                else:
-                    token_embeds = text_encoder.get_input_embeddings().weight
-
-                # Get the index for tokens that we want to freeze
-                index_fixed_tokens = torch.arange(len(tokenizer)) != placeholder_token_id
-                token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]
-
-            if accelerator.sync_gradients:
-                params_to_clip = (
-                    unet.parameters()
-                    if args.initializer_token is not None
-                    else itertools.chain(unet.parameters(), text_encoder.parameters())
-                )
-                accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
-
-            optimizer.step()
-            if not accelerator.optimizer_step_was_skipped:
-                lr_scheduler.step()
-            optimizer.zero_grad(set_to_none=True)
-
-        loss = loss.detach().item()
-        return loss
-
-    try:
-        for epoch in range(num_epochs):
-            local_progress_bar.set_description(f"Epoch {epoch + 1} / {num_epochs}")
-            local_progress_bar.reset()
-
-            unet.train()
-            text_encoder.train()
-            train_loss = 0.0
-
-            sample_checkpoint = False
-
-            for step, batch in enumerate(train_dataloader):
-                with accelerator.accumulate(itertools.chain(unet, text_encoder)):
-                    loss = run_step(
-                        batch,
-                        train=True,
-                        class_images=args.num_class_images != 0
-                    )
+    try:
+        for epoch in range(num_epochs):
+            local_progress_bar.set_description(f"Epoch {epoch + 1} / {num_epochs}")
+            local_progress_bar.reset()
+
+            unet.train()
+            text_encoder.train()
+            train_loss = 0.0
+
+            sample_checkpoint = False
+
+            for step, batch in enumerate(train_dataloader):
+                with accelerator.accumulate(itertools.chain(unet, text_encoder)):
+                    # Convert images to latent space
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
+                    latents = latents * 0.18215
+
+                    # Sample noise that we'll add to the latents
+                    noise = torch.randn_like(latents)
+                    bsz = latents.shape[0]
+                    # Sample a random timestep for each image
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps,
+                                              (bsz,), device=latents.device)
+                    timesteps = timesteps.long()
+
+                    # Add noise to the latents according to the noise magnitude at each timestep
+                    # (this is the forward diffusion process)
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                    # Get the text embedding for conditioning
+                    encoder_hidden_states = prompt_processor.get_embeddings(batch["input_ids"])
+
+                    # Predict the noise residual
+                    noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                    if args.num_class_images != 0:
+                        # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                        noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
+                        noise, noise_prior = torch.chunk(noise, 2, dim=0)
+
+                        # Compute instance loss
+                        loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
+
+                        # Compute prior loss
+                        prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
+
+                        # Add the prior loss to the instance loss.
+                        loss = loss + args.prior_loss_weight * prior_loss
+                    else:
+                        loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
+
+                    accelerator.backward(loss)
+
+                    if args.initializer_token is not None:
+                        # Keep the token embeddings fixed except the newly added
+                        # embeddings for the concept, as we only want to optimize the concept embeddings
+                        if accelerator.num_processes > 1:
+                            token_embeds = text_encoder.module.get_input_embeddings().weight
+                        else:
+                            token_embeds = text_encoder.get_input_embeddings().weight
+
+                        # Get the index for tokens that we want to freeze
+                        index_fixed_tokens = torch.arange(len(tokenizer)) != placeholder_token_id
+                        token_embeds.data[index_fixed_tokens, :] = original_token_embeds[index_fixed_tokens, :]
+
+                    if accelerator.sync_gradients:
+                        params_to_clip = (
+                            unet.parameters()
+                            if args.initializer_token is not None
+                            else itertools.chain(unet.parameters(), text_encoder.parameters())
+                        )
+                        accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+
+                    optimizer.step()
+                    if not accelerator.optimizer_step_was_skipped:
+                        lr_scheduler.step()
+                    optimizer.zero_grad(set_to_none=True)
+
+                    loss = loss.detach().item()
                     train_loss += loss
 
                 # Checks if the accelerator has performed an optimization step behind the scenes
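Note on this hunk: it dissolves the former run_step helper into the training loop; the computation is unchanged, with the `class_images` flag replaced by testing `args.num_class_images` directly. The prior-preservation branch relies on the dataloader concatenating instance and class images along the batch dimension, so `torch.chunk(..., 2, dim=0)` recovers the two halves before weighting the losses. A standalone sketch of that loss, using hypothetical shapes and a placeholder for args.prior_loss_weight:

import torch
import torch.nn.functional as F

# Hypothetical batch: 4 instance + 4 class latents stacked on dim 0,
# as the DreamBooth dataloader does when num_class_images != 0.
noise_pred = torch.randn(8, 4, 64, 64)
noise = torch.randn(8, 4, 64, 64)
prior_loss_weight = 1.0  # stand-in for args.prior_loss_weight

# Split each tensor into its instance half and its class (prior) half.
noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
noise, noise_prior = torch.chunk(noise, 2, dim=0)

# Per-image MSE over (C, H, W), then averaged over the instance half.
loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
# Prior-preservation term computed on the class half.
prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
loss = loss + prior_loss_weight * prior_loss
print(loss.item())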
@@ -937,7 +929,26 @@ def main():
 
             with torch.inference_mode():
                 for step, batch in enumerate(val_dataloader):
-                    loss = run_step(batch)
+                    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
+                    latents = latents * 0.18215
+
+                    noise = torch.randn_like(latents)
+                    bsz = latents.shape[0]
+                    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps,
+                                              (bsz,), device=latents.device)
+                    timesteps = timesteps.long()
+
+                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                    encoder_hidden_states = prompt_processor.get_embeddings(batch["input_ids"])
+
+                    noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                    noise_pred, noise = accelerator.gather_for_metrics((noise_pred, noise))
+
+                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
+
+                    loss = loss.detach().item()
                     val_loss += loss
 
             if accelerator.sync_gradients:
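In the inlined validation loop, the one functional addition is `accelerator.gather_for_metrics`, which gathers the prediction and target tensors from all processes (and drops the samples Accelerate duplicated to pad the last batch) so the MSE reflects the whole validation set rather than one process's shard. A minimal sketch of the pattern, with a hypothetical model and dataset standing in for the real pipeline:

import torch
import torch.nn.functional as F
from accelerate import Accelerator

accelerator = Accelerator()

# Hypothetical stand-ins for the prepared model and dataloader.
model = accelerator.prepare(torch.nn.Linear(8, 8))
val_dataloader = accelerator.prepare(
    torch.utils.data.DataLoader(torch.randn(32, 8), batch_size=4)
)

val_loss = 0.0
with torch.inference_mode():
    for batch in val_dataloader:
        pred = model(batch)
        # Gather predictions and targets from every process; with one process
        # this is a no-op, with several it also deduplicates the padded last batch.
        pred, target = accelerator.gather_for_metrics((pred, batch))
        val_loss += F.mse_loss(pred.float(), target.float()).item()

print(val_loss / len(val_dataloader))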
