Add cogvideo draft #1072

Draft · wants to merge 4 commits into base: main

Changes from 1 commit
3 changes: 3 additions & 0 deletions onediff_diffusers_extensions/examples/cog/README.md

@@ -0,0 +1,3 @@
RUN:

python3 onediff_diffusers_extensions/examples/cog/text_to_image_cog.py --model /data0/hf_models/CogVideoX-2b --compiler nexfort --compiler-config '{"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
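For comparison, a baseline run without graph compilation would drop the compiler flags (the script's --compiler option defaults to "none", per the argparse definition below):

python3 onediff_diffusers_extensions/examples/cog/text_to_image_cog.py --model /data0/hf_models/CogVideoX-2b --compiler none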
193 changes: 193 additions & 0 deletions onediff_diffusers_extensions/examples/cog/text_to_image_cog.py

@@ -0,0 +1,193 @@
import argparse
import json
import time
from typing import Union, List

import PIL
import imageio
import numpy as np
import torch

from diffusers import CogVideoXPipeline
from onediffx import compile_pipe, quantize_pipe
Review comment (Contributor):

Add missing import for tempfile.

The tempfile module is used in the export_to_video_imageio function but is not imported, leading to a runtime error when output_video_path is None.

Suggested change:

+ import tempfile


def export_to_video_imageio(
    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
    output_video_path: str = None,
    fps: int = 8,
) -> str:
    """
    Export video frames to a video file using imageio, to avoid the
    "green screen" issue seen with some models (e.g. CogVideoX).
    """
    if output_video_path is None:
        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
    if isinstance(video_frames[0], PIL.Image.Image):
        video_frames = [np.array(frame) for frame in video_frames]
    with imageio.get_writer(output_video_path, fps=fps) as writer:
        for frame in video_frames:
            writer.append_data(frame)
    return output_video_path
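A minimal standalone smoke test of the exporter (hypothetical frame data; assumes imageio's ffmpeg backend is installed for .mp4 output):

    # Hypothetical check: 8 random 64x64 RGB frames -> one second of video at fps=8.
    frames = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(8)]
    path = export_to_video_imageio(frames, "smoke_test.mp4", fps=8)
    print(f"wrote {path}")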

def parse_args():
    parser = argparse.ArgumentParser(
        description="Use onediff to accelerate video generation with CogVideoX"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="THUDM/CogVideoX-2b",
        help="Model path or identifier.",
    )
    parser.add_argument(
        "--compiler",
        type=str,
        default="none",
        help="Compiler backend to use. Options: 'none', 'nexfort'",
    )
    parser.add_argument(
        "--compiler-config", type=str, help="JSON string for compiler config."
    )
    parser.add_argument(
        "--quantize-config", type=str, help="JSON string for quantization config."
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default=(
            "In the haunting backdrop of a war-torn city, where ruins and crumbled "
            "walls tell a story of devastation, a poignant close-up frames a young "
            "girl. Her face is smudged with ash, a silent testament to the chaos "
            "around her. Her eyes glisten with a mix of sorrow and resilience, "
            "capturing the raw emotion of a world that has lost its innocence to "
            "the ravages of conflict."
        ),
        help="Prompt for the video generation.",
    )
    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=6.5,
        help="The scale factor for the guidance.",
    )
    parser.add_argument(
        "--num-inference-steps", type=int, default=50, help="Number of inference steps."
    )
    parser.add_argument(
        "--num_videos_per_prompt",
        type=int,
        default=1,
        help="Number of videos to generate per prompt.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="./output.mp4",
        help="The path where the generated video will be saved.",
    )
    parser.add_argument(
        "--seed", type=int, default=66, help="Seed for random number generation."
    )
    parser.add_argument(
        "--warmup-iterations",
        type=int,
        default=1,
        help="Number of warm-up iterations before actual inference.",
    )
    return parser.parse_args()


args = parse_args()

device = torch.device("cuda")


class CogVideoGenerator:
    def __init__(
        self, model, compiler_config=None, quantize_config=None, compiler="none"
    ):
        self.pipe = CogVideoXPipeline.from_pretrained(
            model, torch_dtype=torch.float16, variant="fp16"
        ).to(device)

        self.prompt_embeds = None

        if compiler == "nexfort":
            if compiler_config:
                print("nexfort backend compile...")
                self.pipe = self.compile_pipe(self.pipe, compiler_config)

            if quantize_config:
                print("nexfort backend quant...")
                self.pipe = self.quantize_pipe(self.pipe, quantize_config)

    def encode_prompt(self, prompt, num_videos_per_prompt):
        # Pre-compute prompt embeddings once so warmup and the timed run
        # reuse identical inputs.
        self.prompt_embeds, _ = self.pipe.encode_prompt(
            prompt=prompt,
            negative_prompt=None,
            do_classifier_free_guidance=True,
            num_videos_per_prompt=num_videos_per_prompt,
            max_sequence_length=226,
            device=device,
            dtype=torch.float16,
        )

    def warmup(self, gen_args, warmup_iterations):
        warmup_args = gen_args.copy()
        warmup_args["generator"] = torch.Generator(device=device).manual_seed(0)

        print("Starting warmup...")
        start_time = time.time()
        for _ in range(warmup_iterations):
            self.pipe(**warmup_args)
        end_time = time.time()

        print("Warmup complete.")
        print(f"Warmup time: {end_time - start_time:.2f} seconds")

    def generate(self, gen_args):
        gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed)

        # Time only the pipeline call, not the video export.
        start_time = time.time()
        video = self.pipe(**gen_args).frames[0]
        end_time = time.time()

        export_to_video_imageio(video, args.output_path, fps=8)

        return video, end_time - start_time

    def compile_pipe(self, pipe, compiler_config):
        options = compiler_config
        pipe = compile_pipe(
            pipe, backend="nexfort", options=options, fuse_qkv_projections=True
        )
        return pipe

    def quantize_pipe(self, pipe, quantize_config):
        pipe = quantize_pipe(pipe, ignores=[], **quantize_config)
        return pipe
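For reference, the --compiler-config JSON from the README command above is what main() below parses into the options dict handed to compile_pipe; a minimal sketch of that round trip:

    # Parse the README's compiler-config string exactly as main() does.
    options = json.loads(
        '{"mode": "max-optimize:max-autotune:max-autotune", '
        '"memory_format": "channels_last", '
        '"options": {"inductor.optimize_linear_epilogue": false, '
        '"triton.fuse_attention_allow_fp16_reduction": false}}'
    )
    assert options["memory_format"] == "channels_last"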


def main():
    nexfort_compiler_config = (
        json.loads(args.compiler_config) if args.compiler_config else None
    )
    nexfort_quantize_config = (
        json.loads(args.quantize_config) if args.quantize_config else None
    )

    CogVideo = CogVideoGenerator(
        args.model,
        nexfort_compiler_config,
        nexfort_quantize_config,
        compiler=args.compiler,
    )

    CogVideo.encode_prompt(args.prompt, args.num_videos_per_prompt)

    gen_args = {
        "prompt_embeds": CogVideo.prompt_embeds,
        "num_inference_steps": args.num_inference_steps,
        "guidance_scale": args.guidance_scale,
        # Negative prompts are not supported here; pass zero embeddings instead.
        "negative_prompt_embeds": torch.zeros_like(CogVideo.prompt_embeds),
        "num_frames": 8,
    }

    CogVideo.warmup(gen_args, args.warmup_iterations)

    _, inference_time = CogVideo.generate(gen_args)
    print(
        f"Generated video saved to {args.output_path} in {inference_time:.2f} seconds."
    )
    cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Max CUDA memory used: {cuda_mem_after_used:.3f} GiB")


if __name__ == "__main__":
    main()
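Note that torch.cuda.max_memory_allocated() reports the peak since process start, so the figure printed above includes the warmup passes. A sketch, if the peak for the timed generation alone were wanted (torch.cuda.reset_peak_memory_stats() is a standard PyTorch API):

    torch.cuda.reset_peak_memory_stats()  # drop the peak recorded during warmup
    # ... run the timed generation here ...
    peak_gib = torch.cuda.max_memory_allocated() / (1024**3)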