Commit

Add cogvideo draft
lixiang007666 committed Aug 8, 2024
1 parent 92c10ec commit a16f066
Showing 2 changed files with 196 additions and 0 deletions.
3 changes: 3 additions & 0 deletions onediff_diffusers_extensions/examples/cog/README.md
@@ -0,0 +1,3 @@
RUN:

python3 onediff_diffusers_extensions/examples/cog/text_to_image_cog.py --model /data0/hf_models/CogVideoX-2b --compiler nexfort --compiler-config '{"mode": "max-optimize:max-autotune:max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}'
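
To run the eager-mode baseline for comparison (no compilation; "none" is the script's default compiler):

python3 onediff_diffusers_extensions/examples/cog/text_to_image_cog.py --model /data0/hf_models/CogVideoX-2b --compiler none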
193 changes: 193 additions & 0 deletions onediff_diffusers_extensions/examples/cog/text_to_image_cog.py
@@ -0,0 +1,193 @@
import argparse
import json
import tempfile
import time
from typing import List, Union

import imageio
import numpy as np
import PIL.Image
import torch

from diffusers import CogVideoXPipeline
from onediffx import compile_pipe, quantize_pipe


def export_to_video_imageio(
    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
    output_video_path: str = None,
    fps: int = 8,
) -> str:
    """
    Export video frames to a video file using imageio, to avoid the "green
    screen" issue seen with some pipelines (for example, CogVideoX).
    """
    if output_video_path is None:
        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
    if isinstance(video_frames[0], PIL.Image.Image):
        video_frames = [np.array(frame) for frame in video_frames]
    with imageio.get_writer(output_video_path, fps=fps) as writer:
        for frame in video_frames:
            writer.append_data(frame)
    return output_video_path
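
# Example usage (a sketch; `frames` would come from a pipeline call, as in
# CogVideoGenerator.generate below):
#     frames = pipe(**gen_args).frames[0]
#     export_to_video_imageio(frames, "output.mp4", fps=8)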

def parse_args():
    parser = argparse.ArgumentParser(
        description="Use onediff to accelerate video generation with CogVideoX"
    )
parser.add_argument(
"--model",
type=str,
default="THUDM/CogVideoX-2b",
help="Model path or identifier.",
)
parser.add_argument(
"--compiler",
type=str,
default="none",
help="Compiler backend to use. Options: 'none', 'nexfort'",
)
parser.add_argument(
"--compiler-config", type=str, help="JSON string for compiler config."
)
parser.add_argument(
"--quantize-config", type=str, help="JSON string for quantization config."
)
parser.add_argument(
"--prompt",
type=str,
default='In the haunting backdrop of a war-torn city, where ruins and crumbled walls tell a story of devastation, a poignant close-up frames a young girl. Her face is smudged with ash, a silent testament to the chaos around her. Her eyes glistening with a mix of sorrow and resilience, capturing the raw emotion of a world that has lost its innocence to the ravages of conflict.',
help="Prompt for the image generation.",
)
parser.add_argument(
"--guidance_scale",
type=float,
default=6.5,
help="The scale factor for the guidance.",
)
parser.add_argument(
"--num-inference-steps", type=int, default=50, help="Number of inference steps."
)
parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt")
parser.add_argument(
"--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved"
)
parser.add_argument(
"--seed", type=int, default=66, help="Seed for random number generation."
)
parser.add_argument(
"--warmup-iterations",
type=int,
default=1,
help="Number of warm-up iterations before actual inference.",
)
return parser.parse_args()


args = parse_args()

device = torch.device("cuda")


class CogVideoGenerator:
def __init__(
self, model, compiler_config=None, quantize_config=None, compiler="none"
):
self.pipe = CogVideoXPipeline.from_pretrained(
model, torch_dtype=torch.float16, variant="fp16"
).to(device)

self.prompt_embeds = None

if compiler == "nexfort":
if compiler_config:
print("nexfort backend compile...")
self.pipe = self.compile_pipe(self.pipe, compiler_config)

if quantize_config:
print("nexfort backend quant...")
self.pipe = self.quantize_pipe(self.pipe, quantize_config)

    def encode_prompt(self, prompt, num_videos_per_prompt):
        # Encode once and cache the embeddings; generate() reuses them, so
        # the text encoder stays out of the timed generation path.
self.prompt_embeds, _ = self.pipe.encode_prompt(
prompt=prompt,
negative_prompt=None,
do_classifier_free_guidance=True,
num_videos_per_prompt=num_videos_per_prompt,
max_sequence_length=226,
device=device,
dtype=torch.float16,
)
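
    # With `--compiler nexfort`, the first pipeline call triggers compilation
    # and autotuning, so the warmup below keeps that one-time cost out of the
    # timed generation run.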

def warmup(self, gen_args, warmup_iterations):
warmup_args = gen_args.copy()

warmup_args["generator"] = torch.Generator(device=device).manual_seed(0)

print("Starting warmup...")
start_time = time.time()

for _ in range(warmup_iterations):
self.pipe(**warmup_args)

end_time = time.time()
print("Warmup complete.")
print(f"Warmup time: {end_time - start_time:.2f} seconds")

def generate(self, gen_args):
gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed)

# Run the model
start_time = time.time()
video = self.pipe(**gen_args).frames[0]
end_time = time.time()

export_to_video_imageio(video, args.output_path, fps=8)

return video, end_time - start_time

def compile_pipe(self, pipe, compiler_config):
options = compiler_config
pipe = compile_pipe(
pipe, backend="nexfort", options=options, fuse_qkv_projections=True
)
return pipe

def quantize_pipe(self, pipe, quantize_config):
pipe = quantize_pipe(pipe, ignores=[], **quantize_config)
return pipe
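
    # A hypothetical quantization config for the nexfort backend, mirroring
    # other onediff examples (an assumption; verify the accepted keys against
    # your installed onediffx version):
    #     --quantize-config '{"quant_type": "fp8_e4m3_e4m3_dynamic"}'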


def main():
nexfort_compiler_config = (
json.loads(args.compiler_config) if args.compiler_config else None
)
nexfort_quantize_config = (
json.loads(args.quantize_config) if args.quantize_config else None
)

CogVideo = CogVideoGenerator(
args.model,
nexfort_compiler_config,
nexfort_quantize_config,
compiler=args.compiler,
)

CogVideo.encode_prompt(args.prompt, args.num_videos_per_prompt)

gen_args = {
"prompt_embeds": CogVideo.prompt_embeds,
"num_inference_steps": args.num_inference_steps,
"guidance_scale": args.guidance_scale,
"negative_prompt_embeds": torch.zeros_like(CogVideo.prompt_embeds), # Not Supported negative prompt
"num_frames": 8,
}
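    # num_frames=8 keeps this draft quick; CogVideoX's pipeline default is
    # larger (49 in recent diffusers releases), so the clip here is very short.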

CogVideo.warmup(gen_args, args.warmup_iterations)

_, inference_time = CogVideo.generate(gen_args)
print(
f"Generated video saved to {args.output_path} in {inference_time:.2f} seconds."
)
    peak_cuda_memory = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"Peak CUDA memory used: {peak_cuda_memory:.3f} GiB")


if __name__ == "__main__":
main()
