From 68a3735352d3e62d46556b677407fc71b78c47c4 Mon Sep 17 00:00:00 2001 From: Volpeon Date: Tue, 27 Sep 2022 08:21:28 +0200 Subject: Better sample filenames, optimizations --- .gitignore | 4 ++-- environment.yaml | 10 ++++------ main.py | 37 ++++++++++++++++++++++++++----------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 00e7681..a8893c3 100644 --- a/.gitignore +++ b/.gitignore @@ -160,5 +160,5 @@ cython_debug/ #.idea/ text-inversion-model/ -conf.json -v1-inference.yaml +conf*.json +v1-inference.yaml* diff --git a/environment.yaml b/environment.yaml index a460158..8dc930c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -16,21 +16,19 @@ dependencies: - -e git+https://github.com/openai/CLIP.git@main#egg=clip - -e git+https://github.com/hlky/k-diffusion-sd#egg=k_diffusion - -e git+https://github.com/devilismyfriend/latent-diffusion#egg=latent-diffusion + - -e git+https://github.com/huggingface/diffusers#egg=diffusers - accelerate==0.12.0 - albumentations==1.1.0 - - diffusers==0.3.0 - einops==0.4.1 - - imageio-ffmpeg==0.4.7 - - imageio==2.14.1 + - imageio==2.22.0 - kornia==0.6 - pudb==2019.2 - - omegaconf==2.1.1 + - omegaconf==2.2.3 - opencv-python-headless==4.6.0.66 - python-slugify>=6.1.2 - pytorch-lightning==1.7.7 - setuptools==59.5.0 - - streamlit>=0.73.1 - test-tube>=0.7.5 - torch-fidelity==0.3.0 - torchmetrics==0.9.3 - - transformers==4.19.2 + - transformers==4.22.1 diff --git a/main.py b/main.py index aa5af72..8be79e5 100644 --- a/main.py +++ b/main.py @@ -105,6 +105,11 @@ def parse_args(): default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) parser.add_argument( "--learning_rate", type=float, @@ -213,7 +218,7 @@ def parse_args(): elif args.config is not None: with 
open(args.config, 'rt') as f: args = parser.parse_args( - namespace=argparse.Namespace(**json.load(f))) + namespace=argparse.Namespace(**json.load(f)["args"])) env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: @@ -289,7 +294,7 @@ class Checkpointer: self.stable_sample_batches = stable_sample_batches @torch.no_grad() - def checkpoint(self, step, text_encoder, save_samples=True, path=None): + def checkpoint(self, step, postfix, text_encoder, save_samples=True, path=None): print("Saving checkpoint for step %d..." % step) with self.accelerator.autocast(): if path is None: @@ -302,12 +307,11 @@ class Checkpointer: learned_embeds = unwrapped.get_input_embeddings().weight[self.placeholder_token_id] learned_embeds_dict = {self.placeholder_token: learned_embeds.detach().cpu()} - filename = f"%s_%d.bin" % (slugify(self.placeholder_token), step) + filename = f"%s_%d_%s.bin" % (slugify(self.placeholder_token), step, postfix) if path is not None: torch.save(learned_embeds_dict, path) else: - torch.save(learned_embeds_dict, - f"{checkpoints_path}/{filename}") + torch.save(learned_embeds_dict, f"{checkpoints_path}/{filename}") torch.save(learned_embeds_dict, f"{checkpoints_path}/last.bin") del unwrapped del learned_embeds @@ -338,7 +342,7 @@ class Checkpointer: "validation": self.datamodule.val_dataloader(), }[mode] - if mode == "validation" and self.stable_sample_batches > 0: + if mode == "validation" and self.stable_sample_batches > 0 and step > 0: stable_latents = torch.randn( (self.sample_batch_size, pipeline.unet.in_channels, height // 8, width // 8), device=pipeline.device, @@ -348,15 +352,18 @@ class Checkpointer: all_samples = [] filename = f"stable_step_%d.png" % (step) + data_enum = enumerate(data) + # Generate and save stable samples for i in range(0, self.stable_sample_batches): - prompt = [batch["prompt"] for i, batch in enumerate(data) if i < self.sample_batch_size] + prompt = [prompt for i, batch 
in data_enum for j, prompt in enumerate( + batch["prompt"]) if i * data.batch_size + j < self.sample_batch_size] with self.accelerator.autocast(): samples = pipeline( prompt=prompt, height=self.sample_image_size, - latents=stable_latents, + latents=stable_latents[:len(prompt)], width=self.sample_image_size, guidance_scale=guidance_scale, eta=eta, @@ -377,9 +384,12 @@ class Checkpointer: all_samples = [] filename = f"step_%d.png" % (step) + data_enum = enumerate(data) + # Generate and save random samples for i in range(0, self.random_sample_batches): - prompt = [batch["prompt"] for i, batch in enumerate(data) if i < self.sample_batch_size] + prompt = [prompt for i, batch in data_enum for j, prompt in enumerate( + batch["prompt"]) if i * data.batch_size + j < self.sample_batch_size] with self.accelerator.autocast(): samples = pipeline( @@ -486,6 +496,9 @@ def main(): args.pretrained_model_name_or_path + '/unet', ) + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + slice_size = unet.config.attention_head_dim // 2 unet.set_attention_slice(slice_size) @@ -693,7 +706,7 @@ def main(): progress_bar.clear() local_progress_bar.clear() - checkpointer.checkpoint(global_step + global_step_offset, text_encoder) + checkpointer.checkpoint(global_step + global_step_offset, "training", text_encoder) save_resume_file(basepath, args, { "global_step": global_step + global_step_offset, "resume_checkpoint": f"{basepath}/checkpoints/last.bin" @@ -753,6 +766,7 @@ def main(): if min_val_loss > val_loss: accelerator.print(f"Validation loss reached new minimum: {min_val_loss:.2e} -> {val_loss:.2e}") + checkpointer.checkpoint(global_step + global_step_offset, "milestone", text_encoder) min_val_loss = val_loss checkpointer.save_samples( @@ -768,6 +782,7 @@ def main(): print("Finished! 
Saving final checkpoint and resume state.") checkpointer.checkpoint( global_step + global_step_offset, + "end", text_encoder, path=f"{basepath}/learned_embeds.bin" ) @@ -782,7 +797,7 @@ def main(): except KeyboardInterrupt: if accelerator.is_main_process: print("Interrupted, saving checkpoint and resume state...") - checkpointer.checkpoint(global_step + global_step_offset, text_encoder) + checkpointer.checkpoint(global_step + global_step_offset, "end", text_encoder) save_resume_file(basepath, args, { "global_step": global_step + global_step_offset, "resume_checkpoint": f"{basepath}/checkpoints/last.bin" -- cgit v1.2.3-54-g00ecf