-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Cog] some minor fixes and nits #9466
Merged
+65
−61
Merged
Changes from 5 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
f4ce633
fix positional arguments in check_inputs().
sayakpaul 588d759
add video and latents to check_inputs().
sayakpaul 37c8922
prep latents_in_channels.
sayakpaul 4b0dc80
quality
sayakpaul 24b83a6
multiple fixes.
sayakpaul 514ed23
fix
sayakpaul e237924
Merge branch 'main' into correct-args-cog-v2v
sayakpaul 271c110
Merge branch 'main' into correct-args-cog-v2v
sayakpaul File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -207,6 +207,9 @@ def __init__( | |
self.vae_scale_factor_temporal = ( | ||
self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4 | ||
) | ||
self.vae_scaling_factor_image = ( | ||
self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7 | ||
) | ||
Comment on lines
+210
to
+212
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is beneficial for scenarios where we want to run the pipeline without the VAE. |
||
|
||
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) | ||
|
||
|
@@ -348,6 +351,12 @@ def prepare_latents( | |
generator: Optional[torch.Generator] = None, | ||
latents: Optional[torch.Tensor] = None, | ||
): | ||
if isinstance(generator, list) and len(generator) != batch_size: | ||
raise ValueError( | ||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | ||
f" size of {batch_size}. Make sure the batch size matches the length of the generators." | ||
) | ||
|
||
num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 | ||
shape = ( | ||
batch_size, | ||
|
@@ -357,12 +366,6 @@ def prepare_latents( | |
width // self.vae_scale_factor_spatial, | ||
) | ||
|
||
if isinstance(generator, list) and len(generator) != batch_size: | ||
raise ValueError( | ||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" | ||
f" size of {batch_size}. Make sure the batch size matches the length of the generators." | ||
) | ||
|
||
image = image.unsqueeze(2) # [B, C, F, H, W] | ||
|
||
if isinstance(generator, list): | ||
|
@@ -373,7 +376,7 @@ def prepare_latents( | |
image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image] | ||
|
||
image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W] | ||
image_latents = self.vae.config.scaling_factor * image_latents | ||
image_latents = self.vae_scaling_factor_image * image_latents | ||
|
||
padding_shape = ( | ||
batch_size, | ||
|
@@ -397,7 +400,7 @@ def prepare_latents( | |
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents | ||
def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: | ||
latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] | ||
latents = 1 / self.vae.config.scaling_factor * latents | ||
latents = 1 / self.vae_scaling_factor_image * latents | ||
|
||
frames = self.vae.decode(latents).sample | ||
return frames | ||
|
@@ -438,7 +441,6 @@ def check_inputs( | |
width, | ||
negative_prompt, | ||
callback_on_step_end_tensor_inputs, | ||
video=None, | ||
latents=None, | ||
prompt_embeds=None, | ||
negative_prompt_embeds=None, | ||
|
@@ -494,9 +496,6 @@ def check_inputs( | |
f" {negative_prompt_embeds.shape}." | ||
) | ||
|
||
if video is not None and latents is not None: | ||
raise ValueError("Only one of `video` or `latents` should be provided") | ||
|
||
# Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections | ||
def fuse_qkv_projections(self) -> None: | ||
r"""Enables fused QKV projections.""" | ||
|
@@ -584,18 +583,18 @@ def __call__( | |
|
||
Args: | ||
image (`PipelineImageInput`): | ||
The input video to condition the generation on. Must be an image, a list of images or a `torch.Tensor`. | ||
The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`. | ||
prompt (`str` or `List[str]`, *optional*): | ||
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. | ||
instead. | ||
negative_prompt (`str` or `List[str]`, *optional*): | ||
The prompt or prompts not to guide the image generation. If not defined, one has to pass | ||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is | ||
less than `1`). | ||
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): | ||
The height in pixels of the generated image. This is set to 1024 by default for the best results. | ||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): | ||
The width in pixels of the generated image. This is set to 1024 by default for the best results. | ||
height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial): | ||
The height in pixels of the generated image. This is set to 480 by default for the best results. | ||
width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial): | ||
The width in pixels of the generated image. This is set to 720 by default for the best results. | ||
num_frames (`int`, defaults to `48`): | ||
Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will | ||
contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where | ||
|
@@ -665,20 +664,19 @@ def __call__( | |
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): | ||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs | ||
|
||
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial | ||
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial | ||
num_videos_per_prompt = 1 | ||
|
||
# 1. Check inputs. Raise error if not correct | ||
self.check_inputs( | ||
image, | ||
prompt, | ||
height, | ||
width, | ||
negative_prompt, | ||
callback_on_step_end_tensor_inputs, | ||
prompt_embeds, | ||
negative_prompt_embeds, | ||
image=image, | ||
prompt=prompt, | ||
height=height, | ||
width=width, | ||
negative_prompt=negative_prompt, | ||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, | ||
latents=latents, | ||
prompt_embeds=prompt_embeds, | ||
negative_prompt_embeds=negative_prompt_embeds, | ||
) | ||
self._guidance_scale = guidance_scale | ||
self._interrupt = False | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need it because
`height`
and `width`
are already at their default values.