fix sd3 bench infer #931

Merged
merged 4 commits on Jan 15, 2025
123 changes: 57 additions & 66 deletions ppdiffusers/deploy/sd3/infer_dygraph_paddle.py
@@ -101,7 +101,16 @@ def change_scheduler(self, scheduler_type="ddim"):
raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
return scheduler


def get_paddle_memory_info():
"""get_memory_info"""
divisor = 2**30
return (
paddle.device.cuda.memory_allocated() / divisor,
paddle.device.cuda.max_memory_allocated() / divisor,
paddle.device.cuda.memory_reserved() / divisor,
paddle.device.cuda.max_memory_reserved() / divisor,
)
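
For reference, the new helper just normalizes Paddle's CUDA allocator counters from bytes to GiB. A minimal usage sketch, assuming a CUDA build of Paddle and the helper defined above (the tensor is arbitrary, illustration only):

import paddle

x = paddle.randn([1024, 1024])  # any GPU allocation will do
allocated, peak_allocated, reserved, peak_reserved = get_paddle_memory_info()
print(f"allocated={allocated:.2f} GiB, peak allocated={peak_allocated:.2f} GiB")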

def parse_arguments():

parser = argparse.ArgumentParser()
@@ -146,9 +155,6 @@ def parse_arguments():
help="The parse_prompt_type can be one of [raw, lpw]. ",
)
parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
parser.add_argument(
"--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type."
)
parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
parser.add_argument(
"--scheduler",
@@ -192,71 +198,56 @@ def main(args):
scheduler = change_scheduler(pipe, args.scheduler)
pipe.scheduler = scheduler

if args.attention_type == "all":
args.attention_type = ["raw", "cutlass", "flash"]
else:
args.attention_type = [args.attention_type]

for attention_type in args.attention_type:
if attention_type == "raw":
pipe.disable_xformers_memory_efficient_attention()
else:
try:
pipe.enable_xformers_memory_efficient_attention(attention_type)
except Exception as e:
if attention_type == "flash":
warnings.warn(
"Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc."
)
continue
else:
raise ValueError(e)

if not args.use_fp16 and attention_type == "flash":
print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!")
continue

width = args.width
height = args.height
pipe.set_progress_bar_config(disable=False)

folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32"
os.makedirs(folder, exist_ok=True)
if args.task_name in ["text2img", "all"]:
init_image = load_image(
"https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png"
)
# text2img
prompt = "bird"
time_costs = []
# warmup
pipe(
width = args.width
height = args.height
pipe.set_progress_bar_config(disable=False)

folder = f"paddle_fp16" if args.use_fp16 else f"paddle_fp32"
os.makedirs(folder, exist_ok=True)
if args.task_name in ["text2img", "all"]:
# text2img
prompt = "bird"
time_costs = []
memory_metrics = []

# warmup
pipe(
prompt,
num_inference_steps=10,
height=height,
width=width,
)
print("==> Test text2img performance.")
for step in trange(args.benchmark_steps):
start = time.time()
paddle.seed(seed)
images = pipe(
prompt,
num_inference_steps=10,
num_inference_steps=args.inference_steps,
height=height,
width=width,
)
print("==> Test text2img performance.")
for step in trange(args.benchmark_steps):
start = time.time()
paddle.seed(seed)
images = pipe(
prompt,
num_inference_steps=args.inference_steps,
height=height,
width=width,
).images
latency = time.time() - start
time_costs += [latency]
# print(f"No {step:3d} time cost: {latency:2f} s")
print(
f"Attention type: {attention_type}, "
f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
)
images[0].save(f"{folder}/text2img.png")
).images
latency = time.time() - start
time_costs += [latency]

# collect GPU memory usage for this iteration
memory_allocated, max_memory_allocated, memory_reserved, max_memory_reserved = get_paddle_memory_info()
memory_metrics.append([memory_allocated, max_memory_allocated, memory_reserved, max_memory_reserved])

# average memory usage across benchmark iterations
avg_memory = np.mean(memory_metrics, axis=0)

print(
f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
)
print(
f"Memory Info (GB) - Allocated: {avg_memory[0]:.2f}, Max Allocated: {avg_memory[1]:.2f}, "
f"Reserved: {avg_memory[2]:.2f}, Max Reserved: {avg_memory[3]:.2f}"
)
images[0].save(f"{folder}/text2img.png")


if __name__ == "__main__":
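A note on the averaging above: np.mean(memory_metrics, axis=0) takes the column-wise mean over the N x 4 iteration records, giving one average per counter; and since the max_* counters are cumulative peaks since process start, their per-iteration average is dominated by the overall peak. A tiny self-contained check with hypothetical numbers, illustration only:

import numpy as np

memory_metrics = [  # [allocated, max_allocated, reserved, max_reserved] per iteration
    [10.1, 12.0, 14.0, 15.0],
    [10.3, 12.4, 14.2, 15.2],
]
avg_memory = np.mean(memory_metrics, axis=0)  # column-wise mean, one value per counter
print(avg_memory)  # [10.2 12.2 14.1 15.1]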
24 changes: 23 additions & 1 deletion ppdiffusers/deploy/sd3/infer_dygraph_torch.py
@@ -237,6 +237,17 @@ def fn_recursive_attn_processor(name: str, module, processor):
fn_recursive_attn_processor(name, module, processor)


def get_torch_memory_info():
"""get_memory_info"""
divisor = 2**30
return (
torch.cuda.memory_allocated() / divisor,
torch.cuda.max_memory_allocated() / divisor,
torch.cuda.memory_reserved() / divisor,
torch.cuda.max_memory_reserved() / divisor,
)
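
The torch helper mirrors the Paddle one. Because torch's max_* counters also accumulate since process start, per-run peaks would need an explicit reset between iterations; a sketch of that variant (not part of this PR, using the standard torch.cuda.reset_peak_memory_stats API):

import torch

torch.cuda.reset_peak_memory_stats()  # zero the max_* counters
# ... run one pipeline call here ...
allocated, peak_allocated, reserved, peak_reserved = get_torch_memory_info()
print(f"peak allocated this run: {peak_allocated:.2f} GiB")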


def main(args):
if args.tf32:
torch.backends.cuda.matmul.allow_tf32 = True
@@ -289,6 +300,8 @@ def main(args):
# text2img
prompt = "bird"
time_costs = []
memory_metrics = []

# warmup
pipe(
prompt,
@@ -308,14 +321,23 @@ def main(args):
).images
latency = time.time() - start
time_costs += [latency]
# print(f"No {step:3d} time cost: {latency:2f} s")

memory_allocated, max_memory_allocated, memory_reserved, max_memory_reserved = get_torch_memory_info()
memory_metrics.append([memory_allocated, max_memory_allocated, memory_reserved, max_memory_reserved])

avg_memory = np.mean(memory_metrics, axis=0)

print(
f"Attention type: {attention_type}, "
f"Use fp16: {'true' if args.use_fp16 else 'false'}, "
f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, "
f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
)
print(
f"Memory Info (GB) - Allocated: {avg_memory[0]:.2f}, Max Allocated: {avg_memory[1]:.2f}, "
f"Reserved: {avg_memory[2]:.2f}, Max Reserved: {avg_memory[3]:.2f}"
)
images[0].save(f"{folder}/text2img.png")


14 changes: 14 additions & 0 deletions ppdiffusers/deploy/sd3/scripts/benchmark_deploy.sh
@@ -0,0 +1,14 @@
set -eux

# Install triton and make it compatible with Paddle
python -m pip install triton
python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
# Install the paddlemix library to use the custom operators integrated into it.
cd PaddleMIX
pip install -e .
cd ppdiffusers/deploy/sd3

export FLAGS_enable_pir_api=0
# exp_enable_use_cutlass=False is set in text_to_image_generation-stable_diffusion_3.py
python text_to_image_generation-stable_diffusion_3.py --dtype float16 --height 1024 --width 1024 --num-inference-steps 50 --inference_optimize 1 --benchmark yes
16 changes: 2 additions & 14 deletions ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh
@@ -12,21 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# attention raw fp16
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10

# attention cutlass fp16
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10

# attention flash fp16
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10


# attention raw fp32
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10

# attention cutlass fp32
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10

# attention flash fp32
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
python infer_dygraph_paddle.py --scheduler "flow" --task_name all --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
ppdiffusers/deploy/sd3/text_to_image_generation-stable_diffusion_3.py
@@ -96,7 +96,7 @@ def parse_args():
save_model_dir="./tmp/sd3",
enable_new_ir=True,
cache_static_model=True,
exp_enable_use_cutlass=True,
exp_enable_use_cutlass=False,
delete_pass_lists=["add_norm_fuse_pass"],
)

@@ -107,7 +107,6 @@ def parse_args():
image = pipe(
prompt, num_inference_steps=args.num_inference_steps, width=args.width, height=args.height, generator=generator
).images[0]

if args.benchmark:
# warmup
for i in range(3):