 infer.py                                             |  3 +-
 pipelines/stable_diffusion/vlpn_stable_diffusion.py  | 71 ++++++++++----------
 schedulers/scheduling_euler_a.py                     | 24 ----------
 3 files changed, 38 insertions(+), 60 deletions(-)
diff --git a/infer.py b/infer.py
--- a/infer.py
+++ b/infer.py
@@ -28,7 +28,7 @@ default_cmds = {
     "prompt": None,
     "negative_prompt": None,
     "image": None,
-    "image_strength": .7,
+    "image_strength": .3,
     "width": 512,
     "height": 512,
     "batch_size": 1,
@@ -225,6 +225,7 @@ def generate(output_dir, pipeline, args):
             guidance_scale=args.guidance_scale,
             generator=generator,
             latents=init_image,
+            strength=args.image_strength,
         ).images
 
         for j, image in enumerate(images):
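
For intuition on the new default, here is a minimal worked sketch of how `strength` maps to the number of denoising steps run on the init image, using the `init_timestep`/`t_start` formulas from the pipeline diff below. The values `num_inference_steps = 50` and `offset = 0` are illustrative assumptions, not taken from the commit.

    # Illustrative only: mirrors init_timestep / t_start from the pipeline change.
    num_inference_steps = 50  # assumed example value
    offset = 0                # assumed steps_offset

    def steps_run_on_init_image(strength: float) -> int:
        init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)
        t_start = max(num_inference_steps - init_timestep + offset, 0)
        return num_inference_steps - t_start

    print(steps_run_on_init_image(0.7))  # 35 steps -> result drifts further from the input image
    print(steps_run_on_init_image(0.3))  # 15 steps -> result stays much closer to the input image
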
diff --git a/pipelines/stable_diffusion/vlpn_stable_diffusion.py b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
index a8ecedf..b4c85e9 100644
--- a/pipelines/stable_diffusion/vlpn_stable_diffusion.py
+++ b/pipelines/stable_diffusion/vlpn_stable_diffusion.py
@@ -178,35 +178,6 @@ class VlpnStableDiffusion(DiffusionPipeline):
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
-        offset = self.scheduler.config.get("steps_offset", 0)
-
-        if latents is not None and isinstance(latents, PIL.Image.Image):
-            latents = preprocess(latents, width, height)
-            latent_dist = self.vae.encode(latents.to(self.device)).latent_dist
-            latents = latent_dist.sample(generator=generator)
-            latents = 0.18215 * latents
-
-            # expand init_latents for batch_size
-            latents = torch.cat([latents] * batch_size)
-
-            # get the original timestep using init_timestep
-            init_timestep = int(num_inference_steps * strength) + offset
-            init_timestep = min(init_timestep, num_inference_steps)
-
-            if isinstance(self.scheduler, LMSDiscreteScheduler):
-                timesteps = torch.tensor(
-                    [num_inference_steps - init_timestep] * batch_size, dtype=torch.long, device=self.device
-                )
-            else:
-                timesteps = self.scheduler.timesteps[-init_timestep]
-                timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
-
-            # add noise to latents using the timesteps
-            noise = torch.randn(latents.shape, generator=generator, device=self.device)
-            latents = self.scheduler.add_noise(latents, noise, timesteps)
-        else:
-            init_timestep = num_inference_steps + offset
-
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -243,6 +214,10 @@ class VlpnStableDiffusion(DiffusionPipeline):
             # to avoid doing two forward passes
             text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
 
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = num_inference_steps + offset
+        ensure_sigma = not isinstance(latents, PIL.Image.Image)
+
         # get the initial random noise unless the user supplied it
 
         # Unlike in other pipelines, latents need to be generated in the target device
@@ -257,23 +232,48 @@ class VlpnStableDiffusion(DiffusionPipeline):
                 device=latents_device,
                 dtype=text_embeddings.dtype,
             )
+        elif isinstance(latents, PIL.Image.Image):
+            latents = preprocess(latents, width, height)
+            latent_dist = self.vae.encode(latents.to(self.device)).latent_dist
+            latents = latent_dist.sample(generator=generator)
+            latents = 0.18215 * latents
+
+            # expand init_latents for batch_size
+            latents = torch.cat([latents] * batch_size)
+
+            # get the original timestep using init_timestep
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                timesteps = torch.tensor(
+                    [num_inference_steps - init_timestep] * batch_size, dtype=torch.long, device=self.device
+                )
+            else:
+                timesteps = self.scheduler.timesteps[-init_timestep]
+                timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
+
+            # add noise to latents using the timesteps
+            noise = torch.randn(latents.shape, generator=generator, device=self.device)
+            latents = self.scheduler.add_noise(latents, noise, timesteps)
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
             latents = latents.to(self.device)
 
+        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
+        if ensure_sigma:
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = latents * self.scheduler.sigmas[0]
+            elif isinstance(self.scheduler, EulerAScheduler):
+                latents = latents * self.scheduler.sigmas[0]
+
         t_start = max(num_inference_steps - init_timestep + offset, 0)
 
         # Some schedulers like PNDM have timesteps as arrays
         # It's more optimzed to move all timesteps to correct device beforehand
         timesteps_tensor = torch.tensor(self.scheduler.timesteps[t_start:], device=self.device)
 
-        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            latents = latents * self.scheduler.sigmas[0]
-        elif isinstance(self.scheduler, EulerAScheduler):
-            latents = latents * self.scheduler.sigmas[0]
-
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
@@ -292,6 +292,7 @@ class VlpnStableDiffusion(DiffusionPipeline):
 
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
             if isinstance(self.scheduler, LMSDiscreteScheduler):
                 sigma = self.scheduler.sigmas[t_index]
                 # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
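
The key behavioral point in the reordered block above is the new `ensure_sigma` flag: only latents that did not originate from an input image are scaled by the scheduler's initial sigma, because image-derived latents already receive their noise level from `scheduler.add_noise` at the strength-dependent timestep. A minimal sketch of that decision follows; the helper name and example values are hypothetical, not part of the pipeline.

    import torch

    def scale_initial_latents(latents: torch.Tensor, sigmas: torch.Tensor, from_image: bool) -> torch.Tensor:
        # Pure-noise latents start at the largest sigma, so scale them up;
        # image-derived latents were already noised to their start timestep by add_noise.
        return latents if from_image else latents * sigmas[0]

    # Usage sketch with made-up shapes and sigma values:
    sigmas = torch.tensor([14.6, 10.0, 5.0, 1.0, 0.0])
    noise_latents = scale_initial_latents(torch.randn(1, 4, 64, 64), sigmas, from_image=False)
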
diff --git a/schedulers/scheduling_euler_a.py b/schedulers/scheduling_euler_a.py
index 1b1c9cf..a2d0e9f 100644
--- a/schedulers/scheduling_euler_a.py
+++ b/schedulers/scheduling_euler_a.py
@@ -191,27 +191,6 @@ class EulerAScheduler(SchedulerMixin, ConfigMixin):
         self.sigmas = get_sigmas(self.DSsigmas, self.num_inference_steps).to(device=device)
         self.timesteps = np.arange(0, self.num_inference_steps)
 
-    def add_noise_to_input(
-        self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None
-    ) -> Tuple[torch.FloatTensor, float]:
-        """
-        Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a
-        higher noise level sigma_hat = sigma_i + gamma_i*sigma_i.
-
-        TODO Args:
-        """
-        if self.config.s_min <= sigma <= self.config.s_max:
-            gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1)
-        else:
-            gamma = 0
-
-        # sample eps ~ N(0, S_noise^2 * I)
-        eps = self.config.s_noise * torch.randn(sample.shape, generator=generator).to(sample.device)
-        sigma_hat = sigma + gamma * sigma
-        sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps)
-
-        return sample_hat, sigma_hat
-
     def step(
         self,
         model_output: torch.FloatTensor,
@@ -219,9 +198,6 @@ class EulerAScheduler(SchedulerMixin, ConfigMixin):
         timestep_prev: int,
         sample: torch.FloatTensor,
         generator: None,
-        # ,sigma_hat: float,
-        # sigma_prev: float,
-        # sample_hat: torch.FloatTensor,
         return_dict: bool = True,
     ) -> Union[SchedulerOutput, Tuple]:
         """
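
For reference, the deleted `add_noise_to_input` implemented the stochastic "churn" step described in its own docstring: raise the noise level to sigma_hat = sigma + gamma * sigma and add just enough Gaussian noise to match. A standalone sketch of that computation is below; the default `s_churn`, `s_noise`, and `num_steps` values are assumptions for illustration, and the original additionally gated `gamma` on `s_min <= sigma <= s_max`.

    import torch

    def churn(sample: torch.Tensor, sigma: float, s_churn: float = 80.0,
              s_noise: float = 1.0, num_steps: int = 50) -> tuple[torch.Tensor, float]:
        # gamma bounds the per-step noise increase, as in the removed helper
        gamma = min(s_churn / num_steps, 2 ** 0.5 - 1)
        sigma_hat = sigma + gamma * sigma
        eps = s_noise * torch.randn_like(sample)
        # add exactly enough noise to move the sample from level sigma to sigma_hat
        return sample + (sigma_hat ** 2 - sigma ** 2) ** 0.5 * eps, sigma_hat
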
