From 44026ba7e7f5376a80cf0f2b333a0f25c0eeda6c Mon Sep 17 00:00:00 2001
From: Bing Xu
Date: Wed, 28 Sep 2022 22:54:11 -0700
Subject: [PATCH] v0.1

---
 .circleci/config.yml | 80 + .clang-format | 88 + .flake8 | 63 + .github/workflows/docs.yml | 67 + .github/workflows/lint.yml | 41 + .gitignore | 143 ++ .gitmodules | 10 + 3rdparty/composable_kernel | 1 + 3rdparty/cub | 1 + 3rdparty/cutlass | 1 + CITATION.cff | 54 + CODE_OF_CONDUCT.md | 80 + CONTRIBUTING.md | 37 + LICENSE | 201 ++ README.md | 119 + docker/Dockerfile.cuda | 58 + docker/Dockerfile.rocm | 147 ++ docker/README.md | 30 + docker/build.sh | 24 + docker/install/install_ait.sh | 5 + docker/install/install_basic_dep.sh | 4 + docker/install/install_detection_deps.sh | 9 + docker/install/install_doc_dep.sh | 6 + docker/install/install_test_dep.sh | 11 + docker/install/rocm_dev-requirements.txt | 3 + docker/rocm_fix/fix_10736.py | 9 + docs/Makefile | 22 + docs/README.md | 20 + docs/make.bat | 35 + docs/source/arch/index.rst | 12 + docs/source/arch/philosophy.rst | 16 + docs/source/conf.py | 67 + docs/source/debughints.rst | 14 + docs/source/genindex.rst | 2 + docs/source/index.rst | 44 + docs/source/install/index.rst | 64 + docs/source/reference/backend.rst | 60 + docs/source/reference/compiler.rst | 37 + docs/source/reference/cuda.rst | 12 + docs/source/reference/env.rst | 37 + docs/source/reference/frontend.rst | 14 + docs/source/reference/index.rst | 16 + docs/source/reference/ops.rst | 8 + docs/source/reference/rocm.rst | 11 + docs/source/reference/testing.rst | 27 + docs/source/reference/transform.rst | 209 ++ docs/source/reference/utils.rst | 12 + docs/source/runtime/cxx_design.rst | 29 + docs/source/runtime/index.rst | 9 + docs/source/runtime/py_design.rst | 135 + docs/source/tutorial/how_to_add_op.rst | 302 +++ docs/source/tutorial/how_to_infer_pt.rst | 188 ++ docs/source/tutorial/how_to_visualize.rst | 85 + docs/source/tutorial/index.rst | 9 + docs/static/ait_model.html | 866 +++++++ examples/01_resnet-50/README.md | 84 + examples/01_resnet-50/benchmark_ait.py | 132 + examples/01_resnet-50/benchmark_mi250.sh | 4 + examples/01_resnet-50/benchmark_pt.py | 51 + examples/01_resnet-50/infer_with_torch.py | 135 + examples/01_resnet-50/modeling/__init__.py | 14 + examples/01_resnet-50/modeling/resnet.py | 456 ++++ examples/01_resnet-50/weight_utils.py | 173 ++ examples/02_detectron2/README.md | 169 ++ examples/02_detectron2/compile_model.py | 149 ++ examples/02_detectron2/configs/__init__.py | 17 + examples/02_detectron2/configs/config.py | 26 + examples/02_detectron2/configs/defaults.py | 668 +++++ .../configs/faster_rcnn_R_101_FPN.yaml | 47 + .../configs/faster_rcnn_R_50_FPN.yaml | 45 + .../configs/mask_rcnn_R_101_FPN.yaml | 48 + .../configs/mask_rcnn_R_50_FPN.yaml | 46 + examples/02_detectron2/demo.py | 105 + .../modeling/backbone/__init__.py | 25 + .../02_detectron2/modeling/backbone/fpn.py | 228 ++ .../02_detectron2/modeling/backbone/resnet.py | 459 ++++ .../02_detectron2/modeling/backbone/utils.py | 30 + .../modeling/meta_arch/__init__.py | 18 + .../02_detectron2/modeling/meta_arch/rcnn.py | 56 + .../modeling/proposal_generator/__init__.py | 18 + .../modeling/proposal_generator/rpn.py | 177 ++ .../modeling/roi_heads/__init__.py | 20 + .../modeling/roi_heads/box_head.py | 67 + .../modeling/roi_heads/fast_rcnn.py | 209 ++ .../modeling/roi_heads/mask_head.py | 65 + .../modeling/roi_heads/roi_heads.py | 91 + examples/02_detectron2/predictor/__init__.py | 18 + .../02_detectron2/predictor/builtin_meta.py | 180 ++ 
examples/02_detectron2/predictor/predictor.py | 359 +++ .../02_detectron2/prepare_and_run_rcnn.sh | 59 + .../02_detectron2/tools/convert_pt2ait.py | 157 ++ examples/03_bert/README.md | 303 +++ examples/03_bert/benchmark_ait.py | 298 +++ examples/03_bert/benchmark_mi250.sh | 4 + examples/03_bert/benchmark_pt.py | 148 ++ examples/03_bert/demo.py | 108 + examples/03_bert/modeling/__init__.py | 14 + examples/03_bert/modeling/bert.py | 391 +++ examples/03_bert/modeling/torch_model.py | 51 + examples/04_vit/README.md | 126 + examples/04_vit/benchmark_ait.py | 186 ++ examples/04_vit/benchmark_mi250.sh | 4 + examples/04_vit/benchmark_pt.py | 100 + .../04_vit/modeling/vision_transformer.py | 323 +++ examples/04_vit/verification.py | 164 ++ examples/04_vit/weight_utils.py | 115 + examples/05_stable_diffusion/README.md | 136 ++ examples/05_stable_diffusion/benchmark.py | 304 +++ examples/05_stable_diffusion/benchmark_pt.py | 46 + examples/05_stable_diffusion/compile.py | 353 +++ examples/05_stable_diffusion/demo.py | 46 + .../05_stable_diffusion/modeling/attention.py | 104 + examples/05_stable_diffusion/modeling/clip.py | 590 +++++ .../modeling/embeddings.py | 101 + .../05_stable_diffusion/modeling/resnet.py | 238 ++ .../modeling/unet_2d_condition.py | 251 ++ .../modeling/unet_blocks.py | 761 ++++++ examples/05_stable_diffusion/modeling/vae.py | 152 ++ .../pipeline_stable_diffusion_ait.py | 371 +++ .../06_how_to_add_an_op/how_to_add_an_op.py | 249 ++ .../how_to_run_pt_model.py | 131 + licenses/LICENSE.composable_kernel.txt | 28 + licenses/LICENSE.cub.txt | 24 + licenses/LICENSE.cutlass.txt | 27 + licenses/LICENSE.dmlc.txt | 201 ++ licenses/LICENSE.flash_attention.txt | 201 ++ licenses/LICENSE.hipcub.txt | 25 + licenses/LICENSE.markdown_table.txt | 21 + licenses/LICENSE.oneflow.txt | 202 ++ licenses/LICENSE.pydot.txt | 21 + licenses/LICENSE.pytorch.txt | 77 + licenses/LICENSE.tensorrt.txt | 337 +++ licenses/license.header.txt | 13 + python/aitemplate/__init__.py | 42 + python/aitemplate/_libinfo.py | 17 + python/aitemplate/backend/__init__.py | 37 + python/aitemplate/backend/backend_spec.py | 280 +++ python/aitemplate/backend/builder.py | 295 +++ python/aitemplate/backend/codegen.py | 744 ++++++ .../backend/common/concatenate_common.py | 839 +++++++ .../backend/common/elementwise_common.py | 881 +++++++ .../aitemplate/backend/common/gemm_common.py | 72 + .../aitemplate/backend/common/split_common.py | 569 +++++ .../backend/common/tensor/argmax_common.py | 456 ++++ .../common/tensor/batch_gather_common.py | 221 ++ .../common/tensor/permute021_common.py | 304 +++ .../common/tensor/permute102_common.py | 310 +++ .../common/tensor/permute210_common.py | 289 +++ .../backend/common/tensor/slice_common.py | 902 +++++++ .../tensor/slice_reshape_scatter_common.py | 149 ++ .../backend/common/tensor/topk_common.py | 769 ++++++ .../backend/common/tensor_accessor.cuh | 110 + .../backend/common/tensor_accessor_codegen.py | 163 ++ .../backend/common/upsampling2d_common.py | 425 ++++ .../common/vision_ops/efficient_nms_common.py | 250 ++ .../common/vision_ops/efficient_nms_kernel.py | 1160 +++++++++ .../multi_level_roi_align_common.py | 464 ++++ .../backend/common/vision_ops/nms_common.py | 235 ++ .../backend/common/vision_ops/nms_kernel.py | 565 +++++ .../common/vision_ops/roi_align_common.py | 392 +++ python/aitemplate/backend/cuda/__init__.py | 37 + .../backend/cuda/attention/__init__.py | 20 + .../backend/cuda/attention/flash_attention.py | 319 +++ .../backend/cuda/attention/src/fmha.h | 211 ++ 
.../backend/cuda/attention/src/fmha/gemm.h | 482 ++++ .../cuda/attention/src/fmha/gmem_tile.h | 608 +++++ .../cuda/attention/src/fmha/kernel_traits.h | 143 ++ .../backend/cuda/attention/src/fmha/mask.h | 117 + .../cuda/attention/src/fmha/smem_tile.h | 1843 ++++++++++++++ .../backend/cuda/attention/src/fmha/softmax.h | 708 ++++++ .../backend/cuda/attention/src/fmha/utils.h | 1332 ++++++++++ .../src/fmha_block_fprop_fp16_kernel.sm80.cu | 155 ++ .../src/fmha_block_fprop_kernel_1xN.h | 661 +++++ .../cuda/attention/src/fmha_blockmask.h | 69 + .../src/fmha_fprop_fp16_kernel.sm80.cu | 262 ++ .../attention/src/fmha_fprop_kernel_1xN.h | 795 ++++++ .../backend/cuda/attention/src/fmha_kernel.h | 204 ++ .../backend/cuda/attention/src/fmha_utils.h | 111 + .../cuda/attention/src/licenses/LICENSE | 201 ++ .../backend/cuda/attention/src/philox.cuh | 171 ++ .../backend/cuda/common/__init__.py | 19 + .../backend/cuda/common/dummy_op.py | 36 + .../backend/cuda/conv2d/__init__.py | 33 + .../aitemplate/backend/cuda/conv2d/common.py | 244 ++ .../conv2d/common_conv2d_bias_activation.py | 373 +++ .../common_conv2d_bias_add_activation.py | 348 +++ .../cuda/conv2d/common_conv2d_few_channels.py | 111 + .../aitemplate/backend/cuda/conv2d/conv2d.py | 420 ++++ .../backend/cuda/conv2d/conv2d_bias.py | 86 + .../backend/cuda/conv2d/conv2d_bias_add.py | 149 ++ .../cuda/conv2d/conv2d_bias_add_hardswish.py | 149 ++ .../cuda/conv2d/conv2d_bias_add_relu.py | 149 ++ .../cuda/conv2d/conv2d_bias_few_channels.py | 211 ++ .../cuda/conv2d/conv2d_bias_hardswish.py | 81 + .../conv2d_bias_hardswish_few_channels.py | 123 + .../backend/cuda/conv2d/conv2d_bias_relu.py | 81 + .../conv2d/conv2d_bias_relu_few_channels.py | 115 + .../cuda/conv2d/conv2d_bias_sigmoid.py | 82 + .../backend/cuda/conv2d/transposed_conv2d.py | 256 ++ .../cuda/conv2d/transposed_conv2d_bias.py | 264 ++ python/aitemplate/backend/cuda/cuda_common.py | 48 + .../backend/cuda/elementwise/__init__.py | 20 + .../backend/cuda/elementwise/custom_math.cuh | 299 +++ .../cuda/elementwise/fused_elementwise.py | 65 + .../backend/cuda/embedding/__init__.py | 16 + .../backend/cuda/embedding/bert_embeddings.py | 450 ++++ .../cuda/gemm_epilogue_vistor/__init__.py | 18 + .../bmm_common_softmax.py | 256 ++ .../gemm_epilogue_vistor/bmm_rcr_softmax.py | 161 ++ .../gemm_epilogue_vistor/common_softmax.py | 538 ++++ .../gemm_rcr_bias_softmax.py | 118 + .../gemm_epilogue_vistor/gemm_rcr_softmax.py | 216 ++ .../include/gemm_with_softmax.h | 302 +++ .../backend/cuda/gemm_special/__init__.py | 21 + .../backend/cuda/gemm_special/bmm_rcr_n1.py | 616 +++++ .../cuda/gemm_special/bmm_rrr_k1_tanh.py | 258 ++ .../cuda/gemm_special/gemm_rrr_small_nk.py | 374 +++ .../backend/cuda/gemm_universal/__init__.py | 61 + .../backend/cuda/gemm_universal/bmm_ccr.py | 142 ++ .../cuda/gemm_universal/bmm_ccr_add.py | 120 + .../backend/cuda/gemm_universal/bmm_common.py | 391 +++ .../backend/cuda/gemm_universal/bmm_crr.py | 144 ++ .../cuda/gemm_universal/bmm_crr_add.py | 104 + .../cuda/gemm_universal/bmm_permute_common.py | 166 ++ .../backend/cuda/gemm_universal/bmm_rcr.py | 211 ++ .../cuda/gemm_universal/bmm_rcr_permute.py | 211 ++ .../backend/cuda/gemm_universal/bmm_rrr.py | 145 ++ .../cuda/gemm_universal/bmm_rrr_add.py | 121 + .../cuda/gemm_universal/bmm_rrr_permute.py | 219 ++ .../gemm_universal/bmm_softmax_bmm_permute.py | 31 + .../backend/cuda/gemm_universal/common.py | 944 +++++++ .../cuda/gemm_universal/common_bias.py | 134 + .../gemm_universal/common_bias_activation.py | 93 + 
.../gemm_universal/common_bias_broadcast.py | 585 +++++ .../cuda/gemm_universal/common_permute.py | 351 +++ .../backend/cuda/gemm_universal/gemm_rcr.py | 229 ++ .../cuda/gemm_universal/gemm_rcr_bias.py | 158 ++ .../cuda/gemm_universal/gemm_rcr_bias_add.py | 98 + .../gemm_universal/gemm_rcr_bias_add_add.py | 98 + .../gemm_rcr_bias_add_add_relu.py | 98 + .../gemm_universal/gemm_rcr_bias_add_relu.py | 98 + .../gemm_universal/gemm_rcr_bias_fast_gelu.py | 144 ++ .../cuda/gemm_universal/gemm_rcr_bias_gelu.py | 106 + .../gemm_universal/gemm_rcr_bias_hardswish.py | 106 + .../cuda/gemm_universal/gemm_rcr_bias_mul.py | 98 + .../gemm_universal/gemm_rcr_bias_mul_add.py | 98 + .../gemm_universal/gemm_rcr_bias_mul_tanh.py | 98 + .../gemm_universal/gemm_rcr_bias_permute.py | 117 + .../cuda/gemm_universal/gemm_rcr_bias_relu.py | 107 + .../gemm_universal/gemm_rcr_bias_sigmoid.py | 107 + .../gemm_rcr_bias_sigmoid_mul.py | 98 + .../gemm_rcr_bias_sigmoid_mul_tanh.py | 98 + .../gemm_universal/gemm_rcr_bias_swish.py | 107 + .../cuda/gemm_universal/gemm_rcr_bias_tanh.py | 144 ++ .../cuda/gemm_universal/gemm_rcr_permute.py | 220 ++ .../backend/cuda/gemm_universal/gemm_rrr.py | 161 ++ .../cuda/gemm_universal/gemm_rrr_permute.py | 221 ++ .../cuda/gemm_universal/group_common.py | 974 ++++++++ .../cuda/gemm_universal/group_common_bias.py | 76 + .../cuda/gemm_universal/group_gemm_rcr.py | 102 + .../gemm_universal/group_gemm_rcr_bias.py | 75 + .../group_gemm_rcr_bias_relu.py | 75 + .../group_gemm_rcr_bias_sigmoid.py | 75 + .../backend/cuda/gemm_universal/layout.py | 79 + .../cuda/gemm_universal/perm021fc_ccr.py | 124 + .../cuda/gemm_universal/perm021fc_ccr_bias.py | 130 + .../perm021fc_ccr_bias_permute.py | 165 ++ .../cuda/gemm_universal/perm021fc_crc.py | 127 + .../cuda/gemm_universal/perm021fc_crc_bias.py | 133 + .../cuda/gemm_universal/perm102_bmm_rcr.py | 179 ++ .../gemm_universal/perm102_bmm_rcr_bias.py | 155 ++ .../cuda/gemm_universal/perm102_bmm_rrr.py | 148 ++ .../gemm_universal/perm102_bmm_rrr_bias.py | 155 ++ .../backend/cuda/groupnorm/__init__.py | 17 + .../backend/cuda/groupnorm/groupnorm.py | 38 + .../cuda/groupnorm/groupnorm_common.py | 179 ++ .../cuda/groupnorm/groupnorm_kernel.cuh | 561 +++++ .../backend/cuda/groupnorm/groupnorm_swish.py | 38 + .../cuda/layernorm_sigmoid_mul/__init__.py | 28 + .../batch_layernorm_sigmoid_mul.py | 136 ++ .../group_layernorm_sigmoid_mul.py | 303 +++ .../layernorm_sigmoid_mul/layernorm_common.py | 113 + .../layernorm_sigmoid_mul.py | 184 ++ .../layernorm_sigmoid_mul_kernel.cuh | 1735 +++++++++++++ .../aitemplate/backend/cuda/lib_template.py | 46 + .../backend/cuda/padding/__init__.py | 20 + .../backend/cuda/padding/nhwc3to4.py | 218 ++ .../backend/cuda/padding/nhwc3to8.py | 221 ++ .../backend/cuda/padding/pad_last_dim.py | 262 ++ .../backend/cuda/pool2d/__init__.py | 20 + .../backend/cuda/pool2d/avg_pool2d.py | 191 ++ .../backend/cuda/pool2d/max_pool2d.py | 236 ++ .../aitemplate/backend/cuda/pool2d/pool2d.py | 76 + .../backend/cuda/reduce/__init__.py | 27 + .../backend/cuda/reduce/reduce_3d.py | 995 ++++++++ .../backend/cuda/reduce/reduce_common.py | 241 ++ .../backend/cuda/reduce/reduce_mean.py | 88 + .../backend/cuda/reduce/reduce_small_axis.py | 425 ++++ .../backend/cuda/reduce/reduce_sum.py | 105 + python/aitemplate/backend/cuda/reduce/var.py | 289 +++ .../backend/cuda/reduce/vector_norm.py | 102 + .../backend/cuda/softmax/__init__.py | 20 + .../backend/cuda/softmax/softmax.cuh | 538 ++++ .../backend/cuda/softmax/softmax.py | 347 +++ 
python/aitemplate/backend/cuda/target_def.py | 171 ++ .../backend/cuda/tensor/__init__.py | 51 + .../aitemplate/backend/cuda/tensor/argmax.py | 52 + .../backend/cuda/tensor/batch_gather.py | 46 + .../backend/cuda/tensor/concatenate.py | 87 + .../backend/cuda/tensor/concatenate_tanh.py | 103 + .../backend/cuda/tensor/dynamic_slice.py | 84 + .../aitemplate/backend/cuda/tensor/expand.py | 31 + .../aitemplate/backend/cuda/tensor/gather.py | 412 ++++ .../backend/cuda/tensor/permute021.py | 91 + .../backend/cuda/tensor/permute102.py | 91 + .../backend/cuda/tensor/permute210.py | 72 + .../cuda/tensor/slice_reshape_scatter.py | 167 ++ .../backend/cuda/tensor/slice_scatter.py | 90 + .../aitemplate/backend/cuda/tensor/split.py | 77 + python/aitemplate/backend/cuda/tensor/topk.py | 52 + .../backend/cuda/upsample/__init__.py | 20 + .../backend/cuda/upsample/upsampling2d.py | 96 + .../backend/cuda/upsample/upsampling2d_add.py | 99 + python/aitemplate/backend/cuda/utils.py | 63 + .../backend/cuda/view_ops/__init__.py | 20 + .../backend/cuda/view_ops/view_ops.py | 230 ++ .../backend/cuda/vision_ops/__init__.py | 21 + .../backend/cuda/vision_ops/nms/__init__.py | 18 + .../cuda/vision_ops/nms/batched_nms.py | 141 ++ .../vision_ops/nms/batched_nms_kernel.cuh | 203 ++ .../cuda/vision_ops/nms/efficient_nms.py | 62 + .../backend/cuda/vision_ops/nms/nms.py | 52 + .../cuda/vision_ops/roi_ops/__init__.py | 20 + .../roi_ops/multi_level_roi_align.py | 86 + .../cuda/vision_ops/roi_ops/roi_align.py | 108 + .../cuda/vision_ops/roi_ops/roi_ops.py | 94 + python/aitemplate/backend/main_templates.py | 378 +++ python/aitemplate/backend/profiler_cache.py | 554 +++++ python/aitemplate/backend/profiler_runner.py | 123 + python/aitemplate/backend/registry.py | 99 + python/aitemplate/backend/rocm/__init__.py | 30 + .../backend/rocm/common/__init__.py | 19 + .../backend/rocm/common/dummy_op.py | 36 + .../backend/rocm/conv2d/__init__.py | 36 + .../aitemplate/backend/rocm/conv2d/common.py | 892 +++++++ .../aitemplate/backend/rocm/conv2d/conv2d.py | 170 ++ .../backend/rocm/conv2d/conv2d_bias.py | 163 ++ .../rocm/conv2d/conv2d_bias_add_relu.py | 207 ++ .../backend/rocm/conv2d/conv2d_bias_relu.py | 165 ++ .../rocm/conv2d/conv2d_bias_sigmoid.py | 212 ++ .../backend/rocm/conv2d/transposed_conv2d.py | 198 ++ .../conv2d/transposed_conv2d_bias_relu.py | 172 ++ .../backend/rocm/elementwise/__init__.py | 20 + .../backend/rocm/elementwise/custom_math.h | 318 +++ .../rocm/elementwise/fused_elementwise.py | 65 + .../aitemplate/backend/rocm/gemm/__init__.py | 49 + .../aitemplate/backend/rocm/gemm/bmm_ccr.py | 170 ++ .../backend/rocm/gemm/bmm_common.py | 252 ++ .../aitemplate/backend/rocm/gemm/bmm_crr.py | 170 ++ .../backend/rocm/gemm/bmm_permute_common.py | 65 + .../aitemplate/backend/rocm/gemm/bmm_rcr.py | 170 ++ .../backend/rocm/gemm/bmm_rcr_permute.py | 185 ++ .../aitemplate/backend/rocm/gemm/bmm_rrr.py | 170 ++ .../backend/rocm/gemm/bmm_rrr_permute.py | 185 ++ .../backend/rocm/gemm/bmm_softmax_bmm.py | 289 +++ .../rocm/gemm/bmm_softmax_bmm_permute.py | 387 +++ python/aitemplate/backend/rocm/gemm/common.py | 974 ++++++++ .../backend/rocm/gemm/gemm_epilogue.py | 90 + .../aitemplate/backend/rocm/gemm/gemm_rcr.py | 151 ++ .../backend/rocm/gemm/gemm_rcr_bias.py | 151 ++ .../backend/rocm/gemm/gemm_rcr_bias_add.py | 193 ++ .../rocm/gemm/gemm_rcr_bias_add_add.py | 193 ++ .../rocm/gemm/gemm_rcr_bias_add_add_relu.py | 194 ++ .../rocm/gemm/gemm_rcr_bias_add_relu.py | 194 ++ .../rocm/gemm/gemm_rcr_bias_fast_gelu.py | 156 ++ 
.../backend/rocm/gemm/gemm_rcr_bias_mul.py | 193 ++ .../rocm/gemm/gemm_rcr_bias_mul_add.py | 164 ++ .../rocm/gemm/gemm_rcr_bias_mul_tanh.py | 196 ++ .../rocm/gemm/gemm_rcr_bias_permute.py | 166 ++ .../rocm/gemm/gemm_rcr_bias_permute_m2n3.py | 183 ++ .../rocm/gemm/gemm_rcr_bias_permute_m3n2.py | 183 ++ .../backend/rocm/gemm/gemm_rcr_bias_relu.py | 153 ++ .../rocm/gemm/gemm_rcr_bias_sigmoid.py | 204 ++ .../rocm/gemm/gemm_rcr_bias_sigmoid_mul.py | 195 ++ .../gemm/gemm_rcr_bias_sigmoid_mul_tanh.py | 200 ++ .../backend/rocm/gemm/gemm_rcr_bias_swish.py | 157 ++ .../backend/rocm/gemm/gemm_rcr_bias_tanh.py | 206 ++ .../rocm/gemm/gemm_rcr_permute_m2n3.py | 183 ++ .../aitemplate/backend/rocm/gemm/gemm_rrr.py | 151 ++ .../rocm/gemm/gemm_rrr_bias_permute.py | 166 ++ python/aitemplate/backend/rocm/gemm/layout.py | 246 ++ .../backend/rocm/gemm/permute_common.py | 128 + .../aitemplate/backend/rocm/lib_template.py | 42 + .../backend/rocm/normalization/__init__.py | 18 + .../backend/rocm/normalization/groupnorm.py | 444 ++++ .../rocm/normalization/groupnorm_swish.py | 50 + .../backend/rocm/normalization/layernorm.py | 371 +++ .../backend/rocm/normalization/norm_common.py | 503 ++++ .../backend/rocm/normalization/softmax.py | 239 ++ .../backend/rocm/pool2d/__init__.py | 20 + .../backend/rocm/pool2d/avg_pool2d.py | 45 + .../backend/rocm/pool2d/max_pool2d.py | 45 + .../aitemplate/backend/rocm/pool2d/pool2d.py | 278 +++ python/aitemplate/backend/rocm/target_def.py | 265 ++ .../backend/rocm/tensor/__init__.py | 31 + .../aitemplate/backend/rocm/tensor/argmax.py | 51 + .../backend/rocm/tensor/batch_gather.py | 45 + .../backend/rocm/tensor/concatenate.py | 85 + .../backend/rocm/tensor/concatenate_tanh.py | 122 + .../backend/rocm/tensor/dynamic_slice.py | 84 + .../backend/rocm/tensor/permute021.py | 90 + .../backend/rocm/tensor/permute102.py | 90 + .../backend/rocm/tensor/permute210.py | 71 + .../rocm/tensor/slice_reshape_scatter.py | 129 + .../backend/rocm/tensor/slice_scatter.py | 90 + .../aitemplate/backend/rocm/tensor/split.py | 77 + python/aitemplate/backend/rocm/tensor/topk.py | 51 + .../backend/rocm/upsample/__init__.py | 20 + .../backend/rocm/upsample/upsampling2d.py | 96 + .../backend/rocm/upsample/upsampling2d_add.py | 99 + python/aitemplate/backend/rocm/utils.py | 114 + .../backend/rocm/view_ops/__init__.py | 20 + .../backend/rocm/view_ops/view_ops.py | 228 ++ .../backend/rocm/vision_ops/__init__.py | 19 + .../backend/rocm/vision_ops/efficient_nms.py | 53 + .../aitemplate/backend/rocm/vision_ops/nms.py | 51 + .../rocm/vision_ops/roi_ops/__init__.py | 20 + .../roi_ops/multi_level_roi_align.py | 87 + .../rocm/vision_ops/roi_ops/roi_align.py | 108 + python/aitemplate/backend/target.py | 433 ++++ python/aitemplate/backend/task_runner.py | 327 +++ python/aitemplate/compiler/__init__.py | 29 + python/aitemplate/compiler/base.py | 829 +++++++ python/aitemplate/compiler/compiler.py | 236 ++ python/aitemplate/compiler/model.py | 856 +++++++ python/aitemplate/compiler/op_registry.py | 23 + python/aitemplate/compiler/ops/__init__.py | 34 + .../compiler/ops/attention/__init__.py | 21 + .../compiler/ops/attention/flash_attention.py | 186 ++ .../compiler/ops/common/__init__.py | 24 + .../compiler/ops/common/elementwise.py | 153 ++ .../compiler/ops/common/epilogue.py | 59 + .../compiler/ops/common/fused_elementwise.py | 156 ++ python/aitemplate/compiler/ops/common/math.py | 87 + .../compiler/ops/common/python_ops.py | 56 + .../compiler/ops/common/view_ops.py | 495 ++++ .../aitemplate/compiler/ops/conv/__init__.py | 32 
+ .../compiler/ops/conv/cache_entry.py | 71 + .../ops/conv/common_conv2d_bias_activation.py | 97 + .../conv/common_conv2d_bias_add_activation.py | 74 + python/aitemplate/compiler/ops/conv/conv2d.py | 621 +++++ .../compiler/ops/conv/conv2d_bias.py | 74 + .../compiler/ops/conv/conv2d_bias_add.py | 77 + .../ops/conv/conv2d_bias_add_hardswish.py | 76 + .../compiler/ops/conv/conv2d_bias_add_relu.py | 77 + .../ops/conv/conv2d_bias_few_channels.py | 41 + .../ops/conv/conv2d_bias_hardswish.py | 72 + .../conv2d_bias_hardswish_few_channels.py | 29 + .../compiler/ops/conv/conv2d_bias_relu.py | 71 + .../ops/conv/conv2d_bias_relu_few_channels.py | 39 + .../compiler/ops/conv/conv2d_bias_sigmoid.py | 72 + .../conv/special_conv2d_bias_activation.py | 87 + .../compiler/ops/conv/transposed_conv2d.py | 111 + .../ops/conv/transposed_conv2d_bias.py | 110 + .../ops/conv/transposed_conv2d_bias_relu.py | 73 + .../compiler/ops/embedding/__init__.py | 20 + .../compiler/ops/embedding/bert_embeddings.py | 136 ++ .../ops/gemm_epilogue_vistor/__init__.py | 20 + .../gemm_epilogue_vistor/bmm_rcr_softmax.py | 127 + .../gemm_rcr_bias_softmax.py | 76 + .../gemm_epilogue_vistor/gemm_rcr_softmax.py | 67 + .../compiler/ops/gemm_special/__init__.py | 23 + .../compiler/ops/gemm_special/bmm_rcr_n1.py | 97 + .../ops/gemm_special/bmm_rrr_k1_tanh.py | 84 + .../ops/gemm_special/gemm_rrr_small_nk.py | 109 + .../compiler/ops/gemm_universal/__init__.py | 63 + .../compiler/ops/gemm_universal/bmm.py | 67 + .../compiler/ops/gemm_universal/bmm_ccr.py | 111 + .../ops/gemm_universal/bmm_ccr_add.py | 81 + .../compiler/ops/gemm_universal/bmm_crr.py | 111 + .../ops/gemm_universal/bmm_crr_add.py | 81 + .../compiler/ops/gemm_universal/bmm_rcr.py | 111 + .../ops/gemm_universal/bmm_rcr_permute.py | 106 + .../compiler/ops/gemm_universal/bmm_rrr.py | 109 + .../ops/gemm_universal/bmm_rrr_add.py | 77 + .../ops/gemm_universal/bmm_rrr_permute.py | 105 + .../ops/gemm_universal/bmm_softmax_bmm.py | 159 ++ .../gemm_universal/bmm_softmax_bmm_permute.py | 184 ++ .../ops/gemm_universal/cache_entry.py | 58 + .../ops/gemm_universal/gemm_common.py | 762 ++++++ .../compiler/ops/gemm_universal/gemm_rcr.py | 103 + .../ops/gemm_universal/gemm_rcr_bias.py | 100 + .../ops/gemm_universal/gemm_rcr_bias_add.py | 45 + .../gemm_universal/gemm_rcr_bias_add_add.py | 46 + .../gemm_rcr_bias_add_add_relu.py | 46 + .../gemm_universal/gemm_rcr_bias_add_relu.py | 45 + .../gemm_universal/gemm_rcr_bias_broadcast.py | 74 + .../gemm_universal/gemm_rcr_bias_fast_gelu.py | 43 + .../ops/gemm_universal/gemm_rcr_bias_gelu.py | 43 + .../gemm_universal/gemm_rcr_bias_hardswish.py | 42 + .../ops/gemm_universal/gemm_rcr_bias_mul.py | 45 + .../gemm_universal/gemm_rcr_bias_mul_add.py | 46 + .../gemm_universal/gemm_rcr_bias_mul_tanh.py | 45 + .../gemm_universal/gemm_rcr_bias_permute.py | 69 + .../ops/gemm_universal/gemm_rcr_bias_relu.py | 43 + .../gemm_universal/gemm_rcr_bias_sigmoid.py | 43 + .../gemm_rcr_bias_sigmoid_mul.py | 44 + .../gemm_rcr_bias_sigmoid_mul_tanh.py | 45 + .../ops/gemm_universal/gemm_rcr_bias_swish.py | 43 + .../ops/gemm_universal/gemm_rcr_bias_tanh.py | 43 + .../ops/gemm_universal/gemm_rcr_permute.py | 72 + .../compiler/ops/gemm_universal/gemm_rrr.py | 106 + .../ops/gemm_universal/gemm_rrr_bias.py | 86 + .../gemm_universal/gemm_rrr_bias_permute.py | 67 + .../ops/gemm_universal/gemm_rrr_permute.py | 62 + .../ops/gemm_universal/group_gemm_rcr.py | 319 +++ .../ops/gemm_universal/group_gemm_rcr_bias.py | 168 ++ .../group_gemm_rcr_bias_relu.py | 52 + 
.../group_gemm_rcr_bias_sigmoid.py | 52 + .../ops/gemm_universal/perm021fc_ccr.py | 147 ++ .../ops/gemm_universal/perm021fc_ccr_bias.py | 77 + .../perm021fc_ccr_bias_permute.py | 77 + .../ops/gemm_universal/perm021fc_crc.py | 113 + .../ops/gemm_universal/perm021fc_crc_bias.py | 79 + .../ops/gemm_universal/perm102_bmm_rcr.py | 98 + .../gemm_universal/perm102_bmm_rcr_bias.py | 91 + .../ops/gemm_universal/perm102_bmm_rrr.py | 98 + .../gemm_universal/perm102_bmm_rrr_bias.py | 72 + .../compiler/ops/groupnorm/__init__.py | 19 + .../compiler/ops/groupnorm/groupnorm.py | 403 +++ .../compiler/ops/groupnorm/groupnorm_swish.py | 26 + .../compiler/ops/layernorm/__init__.py | 28 + .../layernorm/batch_layernorm_sigmoid_mul.py | 91 + .../compiler/ops/layernorm/group_layernorm.py | 160 ++ .../layernorm/group_layernorm_sigmoid_mul.py | 39 + .../compiler/ops/layernorm/layernorm.py | 417 ++++ .../ops/layernorm/layernorm_sigmoid_mul.py | 97 + .../compiler/ops/padding/__init__.py | 23 + .../compiler/ops/padding/nhwc3to4.py | 39 + .../compiler/ops/padding/nhwc3to8.py | 38 + .../compiler/ops/padding/nhwc_pad_common.py | 109 + .../compiler/ops/padding/pad_last_dim.py | 93 + .../aitemplate/compiler/ops/pool/__init__.py | 22 + .../compiler/ops/pool/avg_pool2d.py | 53 + .../compiler/ops/pool/max_pool2d.py | 56 + python/aitemplate/compiler/ops/pool/pool2d.py | 180 ++ .../compiler/ops/reduce/__init__.py | 24 + .../compiler/ops/reduce/reduce_common.py | 249 ++ .../compiler/ops/reduce/reduce_mean.py | 46 + .../compiler/ops/reduce/reduce_sum.py | 46 + python/aitemplate/compiler/ops/reduce/var.py | 52 + .../compiler/ops/reduce/vector_norm.py | 59 + .../compiler/ops/softmax/__init__.py | 21 + .../compiler/ops/softmax/cache_entry.py | 57 + .../compiler/ops/softmax/softmax.py | 367 +++ .../compiler/ops/tensor/__init__.py | 35 + .../aitemplate/compiler/ops/tensor/argmax.py | 206 ++ .../compiler/ops/tensor/batch_gather.py | 121 + .../aitemplate/compiler/ops/tensor/chunk.py | 70 + .../compiler/ops/tensor/concatenate.py | 260 ++ .../compiler/ops/tensor/concatenate_tanh.py | 28 + .../compiler/ops/tensor/dynamic_slice.py | 186 ++ .../aitemplate/compiler/ops/tensor/expand.py | 135 + .../aitemplate/compiler/ops/tensor/gather.py | 77 + .../aitemplate/compiler/ops/tensor/permute.py | 54 + .../compiler/ops/tensor/permute021.py | 104 + .../compiler/ops/tensor/permute102.py | 142 ++ .../compiler/ops/tensor/permute210.py | 115 + python/aitemplate/compiler/ops/tensor/size.py | 68 + .../ops/tensor/slice_reshape_scatter.py | 144 ++ .../compiler/ops/tensor/slice_scatter.py | 99 + .../aitemplate/compiler/ops/tensor/split.py | 165 ++ python/aitemplate/compiler/ops/tensor/topk.py | 189 ++ .../compiler/ops/upsample/__init__.py | 22 + .../compiler/ops/upsample/upsampling2d.py | 41 + .../compiler/ops/upsample/upsampling2d_add.py | 56 + .../ops/upsample/upsampling_common.py | 172 ++ .../compiler/ops/vision_ops/__init__.py | 19 + .../compiler/ops/vision_ops/nms/__init__.py | 23 + .../ops/vision_ops/nms/batched_nms.py | 114 + .../ops/vision_ops/nms/efficient_nms.py | 244 ++ .../compiler/ops/vision_ops/nms/nms.py | 228 ++ .../ops/vision_ops/roi_ops/__init__.py | 21 + .../roi_ops/multi_level_roi_align.py | 119 + .../ops/vision_ops/roi_ops/roi_align.py | 72 + .../ops/vision_ops/roi_ops/roi_ops.py | 214 ++ python/aitemplate/compiler/public/__init__.py | 77 + python/aitemplate/compiler/tensor_accessor.py | 447 ++++ .../aitemplate/compiler/transform/__init__.py | 39 + .../compiler/transform/apply_padding.py | 245 ++ .../compiler/transform/bind_constants.py 
| 53 + .../compiler/transform/constant_folding.py | 192 ++ .../transform/fuse_conv_elementwise.py | 72 + .../compiler/transform/fuse_conv_patterns.py | 137 ++ .../compiler/transform/fuse_group_ops.py | 716 ++++++ .../compiler/transform/fuse_mm_elementwise.py | 218 ++ .../transform/fuse_mm_elementwise_patterns.py | 169 ++ .../aitemplate/compiler/transform/fuse_ops.py | 200 ++ .../compiler/transform/fuse_parallel_gemms.py | 461 ++++ .../compiler/transform/fuse_permute_bmm.py | 224 ++ .../compiler/transform/fuse_split.py | 282 +++ .../compiler/transform/fuse_utils.py | 191 ++ .../compiler/transform/mark_param_tensor.py | 61 + .../compiler/transform/memory_planning.py | 289 +++ .../compiler/transform/name_graph.py | 86 + .../compiler/transform/optimize_graph.py | 87 + .../aitemplate/compiler/transform/profile.py | 72 + .../compiler/transform/profile_dynamic_dim.py | 46 + .../compiler/transform/refine_graph.py | 159 ++ .../compiler/transform/remove_no_ops.py | 168 ++ .../compiler/transform/remove_unused_ops.py | 43 + .../aitemplate/compiler/transform/toposort.py | 65 + .../transform/transform_memory_ops.py | 174 ++ .../transform/transform_odd_alignment.py | 301 +++ .../transform/transform_special_ops.py | 301 +++ .../transform_strided_op_and_view_op.py | 154 ++ .../transform/transform_strided_ops.py | 475 ++++ .../transform/transform_strided_ops_utils.py | 108 + .../transform/transform_strided_slice.py | 268 ++ .../compiler/transform/transform_utils.py | 341 +++ python/aitemplate/frontend/__init__.py | 19 + python/aitemplate/frontend/nn/__init__.py | 34 + python/aitemplate/frontend/nn/attention.py | 227 ++ python/aitemplate/frontend/nn/container.py | 890 +++++++ .../aitemplate/frontend/nn/conv2d/__init__.py | 30 + .../nn/conv2d/common_conv2d_bias_act.py | 76 + .../nn/conv2d/common_conv2d_bias_add_act.py | 51 + .../aitemplate/frontend/nn/conv2d/conv2d.py | 114 + .../frontend/nn/conv2d/conv2d_bias.py | 43 + .../nn/conv2d/conv2d_bias_add_hardswish.py | 43 + .../nn/conv2d/conv2d_bias_add_relu.py | 43 + .../nn/conv2d/conv2d_bias_few_channels.py | 45 + .../nn/conv2d/conv2d_bias_hardswish.py | 43 + .../conv2d_bias_hardswish_few_channels.py | 45 + .../frontend/nn/conv2d/conv2d_bias_relu.py | 43 + .../conv2d/conv2d_bias_relu_few_channels.py | 45 + .../frontend/nn/conv2d/conv2d_bias_sigmoid.py | 43 + .../nn/conv2d/special_conv2d_bias_act.py | 57 + .../nn/conv2d/transposed_conv2d_bias.py | 43 + .../nn/conv2d/transposed_conv2d_bias_act.py | 76 + .../nn/conv2d/transposed_conv2d_bias_relu.py | 43 + python/aitemplate/frontend/nn/dropout.py | 40 + python/aitemplate/frontend/nn/embedding.py | 121 + python/aitemplate/frontend/nn/fpn_proposal.py | 118 + python/aitemplate/frontend/nn/group_norm.py | 50 + python/aitemplate/frontend/nn/identity.py | 33 + python/aitemplate/frontend/nn/layer_norm.py | 58 + python/aitemplate/frontend/nn/linear.py | 70 + python/aitemplate/frontend/nn/module.py | 757 ++++++ python/aitemplate/frontend/nn/padding.py | 30 + python/aitemplate/frontend/nn/parameter.py | 30 + python/aitemplate/frontend/nn/pool2d.py | 41 + python/aitemplate/frontend/nn/proposal.py | 278 +++ python/aitemplate/frontend/nn/roi_ops.py | 75 + python/aitemplate/frontend/nn/upsample.py | 42 + python/aitemplate/frontend/nn/view_ops.py | 54 + python/aitemplate/frontend/parameter.py | 30 + python/aitemplate/testing/__init__.py | 25 + python/aitemplate/testing/benchmark_ait.py | 160 ++ python/aitemplate/testing/benchmark_pt.py | 55 + python/aitemplate/testing/detect_target.py | 97 + python/aitemplate/testing/test_utils.py 
| 105 + python/aitemplate/utils/__init__.py | 26 + python/aitemplate/utils/graph_utils.py | 74 + python/aitemplate/utils/logger.py | 38 + python/aitemplate/utils/markdown_table.py | 183 ++ .../utils/mk_ck_lib/conv2d_operation.py | 388 +++ .../utils/mk_ck_lib/gemm_operation.py | 513 ++++ .../aitemplate/utils/mk_ck_lib/generator.py | 2164 +++++++++++++++++ .../utils/mk_ck_lib/groupnorm_operation.py | 119 + .../utils/mk_ck_lib/layernorm_operation.py | 119 + python/aitemplate/utils/mk_ck_lib/library.py | 375 +++ python/aitemplate/utils/mk_ck_lib/manifest.py | 178 ++ .../utils/mk_ck_lib/softmax_operation.py | 113 + .../utils/mk_cutlass_lib/extra_conv_emit.py | 127 + .../mk_cutlass_lib/extra_cutlass_generator.py | 112 + .../utils/mk_cutlass_lib/extra_enum.py | 139 ++ .../utils/mk_cutlass_lib/extra_gemm_emit.py | 250 ++ .../utils/mk_cutlass_lib/mk_cutlass_lib.py | 90 + python/aitemplate/utils/shape_utils.py | 187 ++ python/aitemplate/utils/tensor_utils.py | 28 + python/aitemplate/utils/torch_utils.py | 38 + .../utils/visualization/__init__.py | 18 + .../utils/visualization/op_attr_factory.py | 21 + python/aitemplate/utils/visualization/plot.py | 202 ++ .../aitemplate/utils/visualization/pydot.py | 1962 +++++++++++++++ .../utils/visualization/web_template.py | 381 +++ python/setup.py | 176 ++ static/README.md | 143 ++ static/csrc/model_container.cpp | 475 ++++ static/csrc/model_interface.cpp | 229 ++ static/csrc/rocm_hack.cpp | 62 + static/csrc/utility.cpp | 69 + static/include/cuda_device_functions.h | 185 ++ static/include/logging.h | 622 +++++ static/include/macros.h | 29 + static/include/model_container.h | 189 ++ static/include/model_interface.h | 184 ++ static/include/owned_constants.h | 46 + static/include/raii_wrapper.h | 53 + static/include/rocm_device_functions.h | 192 ++ static/include/utility.h | 54 + tests/ci_profile_cache/README.md | 5 + tests/ci_profile_cache/update_cache.py | 827 +++++++ tests/lint/check_meta_header.py | 108 + tests/lint/flake8_problem_matcher.json | 17 + .../backend/test_fused_elementwise_backend.py | 412 ++++ tests/unittest/backend/test_model_api.py | 1408 +++++++++++ .../benchmark/test_group_gemm_benchmark.py | 654 +++++ .../test_strided_layernorm_benchmark.py | 85 + .../compiler/test_constant_folding.py | 316 +++ .../compiler/test_fuse_conv_elementwise.py | 696 ++++++ tests/unittest/compiler/test_fuse_expand.py | 63 + .../compiler/test_fuse_mm_elementwise.py | 1387 +++++++++++ .../compiler/test_fuse_permute_bmm.py | 647 +++++ ...st_fused_elementwise_complex_dependency.py | 283 +++ .../test_fused_elementwise_out_of_order.py | 135 + tests/unittest/compiler/test_group_fusions.py | 458 ++++ .../unittest/compiler/test_memory_planning.py | 124 + .../test_pad_bmm_rrr_bias_with_cat.py | 99 + .../compiler/test_pad_gemm_rrr_with_cat.py | 83 + .../compiler/test_pad_gemm_with_cat.py | 95 + .../test_pad_gemm_with_elementwise.py | 171 ++ .../compiler/test_parallel_gemm_fusions.py | 542 +++++ .../compiler/test_permute_bmm_special_op.py | 79 + tests/unittest/compiler/test_public_import.py | 54 + tests/unittest/compiler/test_refine_graph.py | 319 +++ .../compiler/test_remove_unused_ops.py | 77 + .../compiler/test_slice_elemwise_fusion.py | 513 ++++ .../compiler/test_slice_gemm_fusion.py | 769 ++++++ .../compiler/test_slice_reshape_scatter.py | 140 ++ .../compiler/test_slice_scatter_pattern.py | 474 ++++ .../compiler/test_slice_view_strided.py | 122 + .../compiler/test_split_bmm_fusion.py | 299 +++ .../compiler/test_split_bmm_softmax_bmm.py | 92 + 
.../compiler/test_split_view_strided.py | 180 ++ .../compiler/test_strided_group_gemm.py | 255 ++ .../compiler/test_strided_group_layernorm.py | 335 +++ .../compiler/test_strided_layernorm.py | 297 +++ .../test_strided_layernorm_reshape.py | 147 ++ .../compiler/test_strided_op_cat_pattern.py | 1557 ++++++++++++ .../compiler/test_strided_reshape_cat.py | 247 ++ .../unittest/compiler/test_strided_scatter.py | 870 +++++++ .../compiler/test_strided_split_group_gemm.py | 338 +++ .../compiler/test_strided_view_cat.py | 206 ++ .../unittest/compiler/test_strided_view_op.py | 390 +++ .../unittest/compiler/test_tensor_accessor.py | 360 +++ .../compiler/test_transform_memory_ops.py | 247 ++ .../compiler/test_transform_odd_alignment.py | 493 ++++ .../compiler/test_transform_special_op.py | 366 +++ .../unittest/compiler/test_transform_utils.py | 171 ++ .../unittest/compiler/test_view_strided_op.py | 519 ++++ tests/unittest/frontend/test_module.py | 246 ++ tests/unittest/ops/test_activation.py | 136 ++ tests/unittest/ops/test_argmax.py | 58 + tests/unittest/ops/test_attention.py | 294 +++ tests/unittest/ops/test_avg_pool2d.py | 51 + tests/unittest/ops/test_batch_gather.py | 166 ++ tests/unittest/ops/test_bert_embeddings.py | 167 ++ tests/unittest/ops/test_bmm.py | 399 +++ tests/unittest/ops/test_bmm_add.py | 288 +++ tests/unittest/ops/test_bmm_alpha.py | 282 +++ tests/unittest/ops/test_bmm_permute.py | 112 + tests/unittest/ops/test_bmm_rcr_n1.py | 89 + tests/unittest/ops/test_bmm_rrr_k1_tanh.py | 55 + tests/unittest/ops/test_bmm_softmax.py | 62 + tests/unittest/ops/test_bmm_softmax_bmm.py | 189 ++ tests/unittest/ops/test_chunk.py | 120 + tests/unittest/ops/test_clamp_nan_to_num.py | 178 ++ tests/unittest/ops/test_concatenate.py | 401 +++ tests/unittest/ops/test_concatenate_tanh.py | 369 +++ tests/unittest/ops/test_conv.py | 58 + tests/unittest/ops/test_conv2d_bias_add.py | 71 + tests/unittest/ops/test_conv_bias.py | 62 + .../ops/test_conv_bias_act_few_channels.py | 104 + .../ops/test_conv_bias_add_hardswish.py | 74 + tests/unittest/ops/test_conv_bias_add_relu.py | 71 + .../unittest/ops/test_conv_bias_hardswish.py | 71 + tests/unittest/ops/test_conv_bias_relu.py | 63 + tests/unittest/ops/test_conv_bias_sigmoid.py | 63 + tests/unittest/ops/test_dynamic_conv.py | 70 + tests/unittest/ops/test_efficient_nms.py | 307 +++ tests/unittest/ops/test_expand.py | 104 + tests/unittest/ops/test_flatten.py | 135 + tests/unittest/ops/test_fpn_roi_align.py | 201 ++ tests/unittest/ops/test_fused_elementwise.py | 381 +++ .../ops/test_fused_elementwise_broadcast.py | 471 ++++ ..._fused_elementwise_with_strided_outputs.py | 141 ++ tests/unittest/ops/test_gather.py | 90 + tests/unittest/ops/test_gemm.py | 190 ++ tests/unittest/ops/test_gemm_bias.py | 79 + .../unittest/ops/test_gemm_bias_broadcast.py | 316 +++ .../unittest/ops/test_gemm_bias_hardswish.py | 55 + tests/unittest/ops/test_gemm_bias_permute.py | 187 ++ tests/unittest/ops/test_gemm_bias_relu.py | 76 + tests/unittest/ops/test_gemm_bias_sigmoid.py | 50 + tests/unittest/ops/test_gemm_bias_softmax.py | 69 + tests/unittest/ops/test_gemm_bias_swish.py | 55 + tests/unittest/ops/test_gemm_bias_tanh.py | 68 + tests/unittest/ops/test_gemm_permute.py | 100 + .../ops/test_gemm_rcr_bias_fast_gelu.py | 79 + tests/unittest/ops/test_gemm_rrr_small_nk.py | 68 + tests/unittest/ops/test_gemm_softmax.py | 67 + tests/unittest/ops/test_group_gemm_rcr.py | 91 + .../unittest/ops/test_group_gemm_rcr_bias.py | 77 + .../test_group_gemm_rcr_bias_activation.py | 77 + 
.../ops/test_group_gemm_rcr_bias_cat.py | 79 + tests/unittest/ops/test_group_gemm_rcr_cat.py | 74 + tests/unittest/ops/test_groupnorm.py | 149 ++ tests/unittest/ops/test_layernorm.py | 146 ++ .../ops/test_layernorm_sigmoid_mul.py | 700 ++++++ tests/unittest/ops/test_max_pool2d.py | 51 + tests/unittest/ops/test_nhwc3to4.py | 57 + tests/unittest/ops/test_nhwc3to8.py | 57 + tests/unittest/ops/test_nms.py | 207 ++ tests/unittest/ops/test_norm.py | 176 ++ tests/unittest/ops/test_pad_last_dim.py | 70 + tests/unittest/ops/test_perm021fc_ccr.py | 62 + tests/unittest/ops/test_perm021fc_ccr_bias.py | 68 + .../ops/test_perm021fc_ccr_bias_perm021.py | 69 + tests/unittest/ops/test_perm021fc_crc.py | 63 + tests/unittest/ops/test_perm021fc_crc_bias.py | 66 + tests/unittest/ops/test_perm102_bmm_rcr.py | 97 + tests/unittest/ops/test_perm102_bmm_rrr.py | 97 + tests/unittest/ops/test_permute.py | 55 + tests/unittest/ops/test_permute021.py | 45 + tests/unittest/ops/test_permute102.py | 45 + tests/unittest/ops/test_permute210.py | 48 + tests/unittest/ops/test_proposal.py | 499 ++++ tests/unittest/ops/test_reduce.py | 367 +++ tests/unittest/ops/test_reshape.py | 168 ++ tests/unittest/ops/test_roi_align.py | 136 ++ tests/unittest/ops/test_size_getitem_ops.py | 112 + tests/unittest/ops/test_slice.py | 220 ++ tests/unittest/ops/test_softmax.py | 74 + tests/unittest/ops/test_split.py | 193 ++ tests/unittest/ops/test_split_getitem.py | 222 ++ tests/unittest/ops/test_squeeze.py | 133 + tests/unittest/ops/test_topk.py | 73 + tests/unittest/ops/test_transpose_conv2d.py | 58 + .../ops/test_transpose_conv2d_bias.py | 67 + .../ops/test_transpose_conv2d_bias_relu.py | 68 + .../unittest/ops/test_tuple_list_construct.py | 82 + tests/unittest/ops/test_upsamping2d.py | 78 + tests/unittest/ops/test_upsamping2d_add.py | 109 + tests/unittest/ops/test_var.py | 132 + 846 files changed, 149832 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 .clang-format create mode 100644 .flake8 create mode 100644 .github/workflows/docs.yml create mode 100644 .github/workflows/lint.yml create mode 100644 .gitignore create mode 100644 .gitmodules create mode 160000 3rdparty/composable_kernel create mode 160000 3rdparty/cub create mode 160000 3rdparty/cutlass create mode 100644 CITATION.cff create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 docker/Dockerfile.cuda create mode 100644 docker/Dockerfile.rocm create mode 100644 docker/README.md create mode 100755 docker/build.sh create mode 100644 docker/install/install_ait.sh create mode 100644 docker/install/install_basic_dep.sh create mode 100644 docker/install/install_detection_deps.sh create mode 100644 docker/install/install_doc_dep.sh create mode 100644 docker/install/install_test_dep.sh create mode 100644 docker/install/rocm_dev-requirements.txt create mode 100644 docker/rocm_fix/fix_10736.py create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/make.bat create mode 100644 docs/source/arch/index.rst create mode 100644 docs/source/arch/philosophy.rst create mode 100644 docs/source/conf.py create mode 100644 docs/source/debughints.rst create mode 100644 docs/source/genindex.rst create mode 100644 docs/source/index.rst create mode 100644 docs/source/install/index.rst create mode 100644 docs/source/reference/backend.rst create mode 100644 docs/source/reference/compiler.rst create mode 100644 docs/source/reference/cuda.rst create mode 100644 
docs/source/reference/env.rst create mode 100644 docs/source/reference/frontend.rst create mode 100644 docs/source/reference/index.rst create mode 100644 docs/source/reference/ops.rst create mode 100644 docs/source/reference/rocm.rst create mode 100644 docs/source/reference/testing.rst create mode 100644 docs/source/reference/transform.rst create mode 100644 docs/source/reference/utils.rst create mode 100644 docs/source/runtime/cxx_design.rst create mode 100644 docs/source/runtime/index.rst create mode 100644 docs/source/runtime/py_design.rst create mode 100644 docs/source/tutorial/how_to_add_op.rst create mode 100644 docs/source/tutorial/how_to_infer_pt.rst create mode 100644 docs/source/tutorial/how_to_visualize.rst create mode 100644 docs/source/tutorial/index.rst create mode 100644 docs/static/ait_model.html create mode 100644 examples/01_resnet-50/README.md create mode 100644 examples/01_resnet-50/benchmark_ait.py create mode 100644 examples/01_resnet-50/benchmark_mi250.sh create mode 100644 examples/01_resnet-50/benchmark_pt.py create mode 100644 examples/01_resnet-50/infer_with_torch.py create mode 100644 examples/01_resnet-50/modeling/__init__.py create mode 100644 examples/01_resnet-50/modeling/resnet.py create mode 100644 examples/01_resnet-50/weight_utils.py create mode 100644 examples/02_detectron2/README.md create mode 100644 examples/02_detectron2/compile_model.py create mode 100644 examples/02_detectron2/configs/__init__.py create mode 100644 examples/02_detectron2/configs/config.py create mode 100644 examples/02_detectron2/configs/defaults.py create mode 100644 examples/02_detectron2/configs/faster_rcnn_R_101_FPN.yaml create mode 100644 examples/02_detectron2/configs/faster_rcnn_R_50_FPN.yaml create mode 100644 examples/02_detectron2/configs/mask_rcnn_R_101_FPN.yaml create mode 100644 examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml create mode 100644 examples/02_detectron2/demo.py create mode 100644 examples/02_detectron2/modeling/backbone/__init__.py create mode 100644 examples/02_detectron2/modeling/backbone/fpn.py create mode 100644 examples/02_detectron2/modeling/backbone/resnet.py create mode 100644 examples/02_detectron2/modeling/backbone/utils.py create mode 100644 examples/02_detectron2/modeling/meta_arch/__init__.py create mode 100644 examples/02_detectron2/modeling/meta_arch/rcnn.py create mode 100644 examples/02_detectron2/modeling/proposal_generator/__init__.py create mode 100644 examples/02_detectron2/modeling/proposal_generator/rpn.py create mode 100644 examples/02_detectron2/modeling/roi_heads/__init__.py create mode 100644 examples/02_detectron2/modeling/roi_heads/box_head.py create mode 100644 examples/02_detectron2/modeling/roi_heads/fast_rcnn.py create mode 100644 examples/02_detectron2/modeling/roi_heads/mask_head.py create mode 100644 examples/02_detectron2/modeling/roi_heads/roi_heads.py create mode 100644 examples/02_detectron2/predictor/__init__.py create mode 100644 examples/02_detectron2/predictor/builtin_meta.py create mode 100644 examples/02_detectron2/predictor/predictor.py create mode 100755 examples/02_detectron2/prepare_and_run_rcnn.sh create mode 100644 examples/02_detectron2/tools/convert_pt2ait.py create mode 100644 examples/03_bert/README.md create mode 100644 examples/03_bert/benchmark_ait.py create mode 100644 examples/03_bert/benchmark_mi250.sh create mode 100644 examples/03_bert/benchmark_pt.py create mode 100644 examples/03_bert/demo.py create mode 100644 examples/03_bert/modeling/__init__.py create mode 100644 
examples/03_bert/modeling/bert.py create mode 100644 examples/03_bert/modeling/torch_model.py create mode 100644 examples/04_vit/README.md create mode 100644 examples/04_vit/benchmark_ait.py create mode 100644 examples/04_vit/benchmark_mi250.sh create mode 100644 examples/04_vit/benchmark_pt.py create mode 100644 examples/04_vit/modeling/vision_transformer.py create mode 100644 examples/04_vit/verification.py create mode 100644 examples/04_vit/weight_utils.py create mode 100644 examples/05_stable_diffusion/README.md create mode 100644 examples/05_stable_diffusion/benchmark.py create mode 100644 examples/05_stable_diffusion/benchmark_pt.py create mode 100644 examples/05_stable_diffusion/compile.py create mode 100644 examples/05_stable_diffusion/demo.py create mode 100644 examples/05_stable_diffusion/modeling/attention.py create mode 100644 examples/05_stable_diffusion/modeling/clip.py create mode 100644 examples/05_stable_diffusion/modeling/embeddings.py create mode 100644 examples/05_stable_diffusion/modeling/resnet.py create mode 100644 examples/05_stable_diffusion/modeling/unet_2d_condition.py create mode 100644 examples/05_stable_diffusion/modeling/unet_blocks.py create mode 100644 examples/05_stable_diffusion/modeling/vae.py create mode 100644 examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py create mode 100644 examples/06_how_to_add_an_op/how_to_add_an_op.py create mode 100644 examples/07_how_to_run_pt_model/how_to_run_pt_model.py create mode 100644 licenses/LICENSE.composable_kernel.txt create mode 100644 licenses/LICENSE.cub.txt create mode 100644 licenses/LICENSE.cutlass.txt create mode 100644 licenses/LICENSE.dmlc.txt create mode 100644 licenses/LICENSE.flash_attention.txt create mode 100644 licenses/LICENSE.hipcub.txt create mode 100644 licenses/LICENSE.markdown_table.txt create mode 100644 licenses/LICENSE.oneflow.txt create mode 100644 licenses/LICENSE.pydot.txt create mode 100644 licenses/LICENSE.pytorch.txt create mode 100644 licenses/LICENSE.tensorrt.txt create mode 100644 licenses/license.header.txt create mode 100644 python/aitemplate/__init__.py create mode 100644 python/aitemplate/_libinfo.py create mode 100644 python/aitemplate/backend/__init__.py create mode 100644 python/aitemplate/backend/backend_spec.py create mode 100644 python/aitemplate/backend/builder.py create mode 100644 python/aitemplate/backend/codegen.py create mode 100644 python/aitemplate/backend/common/concatenate_common.py create mode 100644 python/aitemplate/backend/common/elementwise_common.py create mode 100644 python/aitemplate/backend/common/gemm_common.py create mode 100644 python/aitemplate/backend/common/split_common.py create mode 100644 python/aitemplate/backend/common/tensor/argmax_common.py create mode 100644 python/aitemplate/backend/common/tensor/batch_gather_common.py create mode 100644 python/aitemplate/backend/common/tensor/permute021_common.py create mode 100644 python/aitemplate/backend/common/tensor/permute102_common.py create mode 100644 python/aitemplate/backend/common/tensor/permute210_common.py create mode 100644 python/aitemplate/backend/common/tensor/slice_common.py create mode 100644 python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py create mode 100644 python/aitemplate/backend/common/tensor/topk_common.py create mode 100644 python/aitemplate/backend/common/tensor_accessor.cuh create mode 100644 python/aitemplate/backend/common/tensor_accessor_codegen.py create mode 100644 python/aitemplate/backend/common/upsampling2d_common.py create mode 
100644 python/aitemplate/backend/common/vision_ops/efficient_nms_common.py create mode 100644 python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py create mode 100644 python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py create mode 100644 python/aitemplate/backend/common/vision_ops/nms_common.py create mode 100644 python/aitemplate/backend/common/vision_ops/nms_kernel.py create mode 100644 python/aitemplate/backend/common/vision_ops/roi_align_common.py create mode 100644 python/aitemplate/backend/cuda/__init__.py create mode 100644 python/aitemplate/backend/cuda/attention/__init__.py create mode 100644 python/aitemplate/backend/cuda/attention/flash_attention.py create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/gemm.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/mask.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/softmax.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha/utils.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_kernel.h create mode 100644 python/aitemplate/backend/cuda/attention/src/fmha_utils.h create mode 100644 python/aitemplate/backend/cuda/attention/src/licenses/LICENSE create mode 100644 python/aitemplate/backend/cuda/attention/src/philox.cuh create mode 100644 python/aitemplate/backend/cuda/common/__init__.py create mode 100644 python/aitemplate/backend/cuda/common/dummy_op.py create mode 100644 python/aitemplate/backend/cuda/conv2d/__init__.py create mode 100644 python/aitemplate/backend/cuda/conv2d/common.py create mode 100644 python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py create mode 100644 python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py create mode 100644 python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py create mode 100644 python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py create mode 
100644 python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py create mode 100644 python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py create mode 100644 python/aitemplate/backend/cuda/cuda_common.py create mode 100644 python/aitemplate/backend/cuda/elementwise/__init__.py create mode 100644 python/aitemplate/backend/cuda/elementwise/custom_math.cuh create mode 100644 python/aitemplate/backend/cuda/elementwise/fused_elementwise.py create mode 100644 python/aitemplate/backend/cuda/embedding/__init__.py create mode 100644 python/aitemplate/backend/cuda/embedding/bert_embeddings.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py create mode 100644 python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h create mode 100644 python/aitemplate/backend/cuda/gemm_special/__init__.py create mode 100644 python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py create mode 100644 python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py create mode 100644 python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/__init__.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_common.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/common_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py create 
mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_common.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/layout.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py create mode 100644 python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py create mode 100644 python/aitemplate/backend/cuda/groupnorm/__init__.py create mode 100644 python/aitemplate/backend/cuda/groupnorm/groupnorm.py create mode 100644 python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py create mode 100644 python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh create mode 100644 python/aitemplate/backend/cuda/groupnorm/groupnorm_swish.py create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/__init__.py create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/batch_layernorm_sigmoid_mul.py create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/group_layernorm_sigmoid_mul.py create mode 100644 
python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_common.py create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul.py create mode 100644 python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_sigmoid_mul_kernel.cuh create mode 100644 python/aitemplate/backend/cuda/lib_template.py create mode 100644 python/aitemplate/backend/cuda/padding/__init__.py create mode 100644 python/aitemplate/backend/cuda/padding/nhwc3to4.py create mode 100644 python/aitemplate/backend/cuda/padding/nhwc3to8.py create mode 100644 python/aitemplate/backend/cuda/padding/pad_last_dim.py create mode 100644 python/aitemplate/backend/cuda/pool2d/__init__.py create mode 100644 python/aitemplate/backend/cuda/pool2d/avg_pool2d.py create mode 100644 python/aitemplate/backend/cuda/pool2d/max_pool2d.py create mode 100644 python/aitemplate/backend/cuda/pool2d/pool2d.py create mode 100644 python/aitemplate/backend/cuda/reduce/__init__.py create mode 100644 python/aitemplate/backend/cuda/reduce/reduce_3d.py create mode 100644 python/aitemplate/backend/cuda/reduce/reduce_common.py create mode 100644 python/aitemplate/backend/cuda/reduce/reduce_mean.py create mode 100644 python/aitemplate/backend/cuda/reduce/reduce_small_axis.py create mode 100644 python/aitemplate/backend/cuda/reduce/reduce_sum.py create mode 100644 python/aitemplate/backend/cuda/reduce/var.py create mode 100644 python/aitemplate/backend/cuda/reduce/vector_norm.py create mode 100644 python/aitemplate/backend/cuda/softmax/__init__.py create mode 100644 python/aitemplate/backend/cuda/softmax/softmax.cuh create mode 100644 python/aitemplate/backend/cuda/softmax/softmax.py create mode 100644 python/aitemplate/backend/cuda/target_def.py create mode 100644 python/aitemplate/backend/cuda/tensor/__init__.py create mode 100644 python/aitemplate/backend/cuda/tensor/argmax.py create mode 100644 python/aitemplate/backend/cuda/tensor/batch_gather.py create mode 100644 python/aitemplate/backend/cuda/tensor/concatenate.py create mode 100644 python/aitemplate/backend/cuda/tensor/concatenate_tanh.py create mode 100644 python/aitemplate/backend/cuda/tensor/dynamic_slice.py create mode 100644 python/aitemplate/backend/cuda/tensor/expand.py create mode 100644 python/aitemplate/backend/cuda/tensor/gather.py create mode 100644 python/aitemplate/backend/cuda/tensor/permute021.py create mode 100644 python/aitemplate/backend/cuda/tensor/permute102.py create mode 100644 python/aitemplate/backend/cuda/tensor/permute210.py create mode 100644 python/aitemplate/backend/cuda/tensor/slice_reshape_scatter.py create mode 100644 python/aitemplate/backend/cuda/tensor/slice_scatter.py create mode 100644 python/aitemplate/backend/cuda/tensor/split.py create mode 100644 python/aitemplate/backend/cuda/tensor/topk.py create mode 100644 python/aitemplate/backend/cuda/upsample/__init__.py create mode 100644 python/aitemplate/backend/cuda/upsample/upsampling2d.py create mode 100644 python/aitemplate/backend/cuda/upsample/upsampling2d_add.py create mode 100644 python/aitemplate/backend/cuda/utils.py create mode 100644 python/aitemplate/backend/cuda/view_ops/__init__.py create mode 100644 python/aitemplate/backend/cuda/view_ops/view_ops.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/__init__.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/nms/__init__.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/nms/batched_nms.py create mode 100644 
python/aitemplate/backend/cuda/vision_ops/nms/batched_nms_kernel.cuh create mode 100644 python/aitemplate/backend/cuda/vision_ops/nms/efficient_nms.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/nms/nms.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/roi_ops/__init__.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/roi_ops/multi_level_roi_align.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_align.py create mode 100644 python/aitemplate/backend/cuda/vision_ops/roi_ops/roi_ops.py create mode 100644 python/aitemplate/backend/main_templates.py create mode 100644 python/aitemplate/backend/profiler_cache.py create mode 100644 python/aitemplate/backend/profiler_runner.py create mode 100644 python/aitemplate/backend/registry.py create mode 100644 python/aitemplate/backend/rocm/__init__.py create mode 100644 python/aitemplate/backend/rocm/common/__init__.py create mode 100644 python/aitemplate/backend/rocm/common/dummy_op.py create mode 100644 python/aitemplate/backend/rocm/conv2d/__init__.py create mode 100644 python/aitemplate/backend/rocm/conv2d/common.py create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d.py create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d_bias.py create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d_bias_add_relu.py create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d_bias_relu.py create mode 100644 python/aitemplate/backend/rocm/conv2d/conv2d_bias_sigmoid.py create mode 100644 python/aitemplate/backend/rocm/conv2d/transposed_conv2d.py create mode 100644 python/aitemplate/backend/rocm/conv2d/transposed_conv2d_bias_relu.py create mode 100644 python/aitemplate/backend/rocm/elementwise/__init__.py create mode 100644 python/aitemplate/backend/rocm/elementwise/custom_math.h create mode 100644 python/aitemplate/backend/rocm/elementwise/fused_elementwise.py create mode 100644 python/aitemplate/backend/rocm/gemm/__init__.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_ccr.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_common.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_crr.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_permute_common.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_rcr.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_rcr_permute.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_rrr.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_rrr_permute.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm.py create mode 100644 python/aitemplate/backend/rocm/gemm/bmm_softmax_bmm_permute.py create mode 100644 python/aitemplate/backend/rocm/gemm/common.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_epilogue.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_add_relu.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_add_relu.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_fast_gelu.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_add.py create mode 100644 
python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_mul_tanh.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m2n3.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_permute_m3n2.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_relu.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_sigmoid_mul_tanh.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_swish.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_bias_tanh.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rcr_permute_m2n3.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rrr.py create mode 100644 python/aitemplate/backend/rocm/gemm/gemm_rrr_bias_permute.py create mode 100644 python/aitemplate/backend/rocm/gemm/layout.py create mode 100644 python/aitemplate/backend/rocm/gemm/permute_common.py create mode 100644 python/aitemplate/backend/rocm/lib_template.py create mode 100644 python/aitemplate/backend/rocm/normalization/__init__.py create mode 100644 python/aitemplate/backend/rocm/normalization/groupnorm.py create mode 100644 python/aitemplate/backend/rocm/normalization/groupnorm_swish.py create mode 100644 python/aitemplate/backend/rocm/normalization/layernorm.py create mode 100644 python/aitemplate/backend/rocm/normalization/norm_common.py create mode 100644 python/aitemplate/backend/rocm/normalization/softmax.py create mode 100644 python/aitemplate/backend/rocm/pool2d/__init__.py create mode 100644 python/aitemplate/backend/rocm/pool2d/avg_pool2d.py create mode 100644 python/aitemplate/backend/rocm/pool2d/max_pool2d.py create mode 100644 python/aitemplate/backend/rocm/pool2d/pool2d.py create mode 100644 python/aitemplate/backend/rocm/target_def.py create mode 100644 python/aitemplate/backend/rocm/tensor/__init__.py create mode 100644 python/aitemplate/backend/rocm/tensor/argmax.py create mode 100644 python/aitemplate/backend/rocm/tensor/batch_gather.py create mode 100644 python/aitemplate/backend/rocm/tensor/concatenate.py create mode 100644 python/aitemplate/backend/rocm/tensor/concatenate_tanh.py create mode 100644 python/aitemplate/backend/rocm/tensor/dynamic_slice.py create mode 100644 python/aitemplate/backend/rocm/tensor/permute021.py create mode 100644 python/aitemplate/backend/rocm/tensor/permute102.py create mode 100644 python/aitemplate/backend/rocm/tensor/permute210.py create mode 100644 python/aitemplate/backend/rocm/tensor/slice_reshape_scatter.py create mode 100644 python/aitemplate/backend/rocm/tensor/slice_scatter.py create mode 100644 python/aitemplate/backend/rocm/tensor/split.py create mode 100644 python/aitemplate/backend/rocm/tensor/topk.py create mode 100644 python/aitemplate/backend/rocm/upsample/__init__.py create mode 100644 python/aitemplate/backend/rocm/upsample/upsampling2d.py create mode 100644 python/aitemplate/backend/rocm/upsample/upsampling2d_add.py create mode 100644 python/aitemplate/backend/rocm/utils.py create mode 100644 python/aitemplate/backend/rocm/view_ops/__init__.py create mode 100644 python/aitemplate/backend/rocm/view_ops/view_ops.py create mode 100644 python/aitemplate/backend/rocm/vision_ops/__init__.py create mode 100644 python/aitemplate/backend/rocm/vision_ops/efficient_nms.py create 
mode 100644 python/aitemplate/backend/rocm/vision_ops/nms.py create mode 100644 python/aitemplate/backend/rocm/vision_ops/roi_ops/__init__.py create mode 100644 python/aitemplate/backend/rocm/vision_ops/roi_ops/multi_level_roi_align.py create mode 100644 python/aitemplate/backend/rocm/vision_ops/roi_ops/roi_align.py create mode 100644 python/aitemplate/backend/target.py create mode 100644 python/aitemplate/backend/task_runner.py create mode 100644 python/aitemplate/compiler/__init__.py create mode 100644 python/aitemplate/compiler/base.py create mode 100644 python/aitemplate/compiler/compiler.py create mode 100644 python/aitemplate/compiler/model.py create mode 100644 python/aitemplate/compiler/op_registry.py create mode 100644 python/aitemplate/compiler/ops/__init__.py create mode 100644 python/aitemplate/compiler/ops/attention/__init__.py create mode 100644 python/aitemplate/compiler/ops/attention/flash_attention.py create mode 100644 python/aitemplate/compiler/ops/common/__init__.py create mode 100644 python/aitemplate/compiler/ops/common/elementwise.py create mode 100644 python/aitemplate/compiler/ops/common/epilogue.py create mode 100644 python/aitemplate/compiler/ops/common/fused_elementwise.py create mode 100644 python/aitemplate/compiler/ops/common/math.py create mode 100644 python/aitemplate/compiler/ops/common/python_ops.py create mode 100644 python/aitemplate/compiler/ops/common/view_ops.py create mode 100644 python/aitemplate/compiler/ops/conv/__init__.py create mode 100644 python/aitemplate/compiler/ops/conv/cache_entry.py create mode 100644 python/aitemplate/compiler/ops/conv/common_conv2d_bias_activation.py create mode 100644 python/aitemplate/compiler/ops/conv/common_conv2d_bias_add_activation.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_add.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_add_hardswish.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_add_relu.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_few_channels.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_hardswish_few_channels.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_relu.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_relu_few_channels.py create mode 100644 python/aitemplate/compiler/ops/conv/conv2d_bias_sigmoid.py create mode 100644 python/aitemplate/compiler/ops/conv/special_conv2d_bias_activation.py create mode 100644 python/aitemplate/compiler/ops/conv/transposed_conv2d.py create mode 100644 python/aitemplate/compiler/ops/conv/transposed_conv2d_bias.py create mode 100644 python/aitemplate/compiler/ops/conv/transposed_conv2d_bias_relu.py create mode 100644 python/aitemplate/compiler/ops/embedding/__init__.py create mode 100644 python/aitemplate/compiler/ops/embedding/bert_embeddings.py create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/__init__.py create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/bmm_rcr_softmax.py create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py create mode 100644 python/aitemplate/compiler/ops/gemm_epilogue_vistor/gemm_rcr_softmax.py create mode 100644 python/aitemplate/compiler/ops/gemm_special/__init__.py create mode 100644 
python/aitemplate/compiler/ops/gemm_special/bmm_rcr_n1.py create mode 100644 python/aitemplate/compiler/ops/gemm_special/bmm_rrr_k1_tanh.py create mode 100644 python/aitemplate/compiler/ops/gemm_special/gemm_rrr_small_nk.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/__init__.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_ccr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_ccr_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_crr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_crr_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rcr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rcr_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rrr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_rrr_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/bmm_softmax_bmm_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/cache_entry.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_common.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_add_relu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_add_relu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_broadcast.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_fast_gelu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_gelu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_hardswish.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_add.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_mul_tanh.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_relu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_swish.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_bias_tanh.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rcr_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rrr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_bias_permute.py create mode 
100644 python/aitemplate/compiler/ops/gemm_universal/gemm_rrr_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_relu.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/group_gemm_rcr_bias_sigmoid.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm021fc_ccr_bias_permute.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm021fc_crc_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rcr_bias.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr.py create mode 100644 python/aitemplate/compiler/ops/gemm_universal/perm102_bmm_rrr_bias.py create mode 100644 python/aitemplate/compiler/ops/groupnorm/__init__.py create mode 100644 python/aitemplate/compiler/ops/groupnorm/groupnorm.py create mode 100644 python/aitemplate/compiler/ops/groupnorm/groupnorm_swish.py create mode 100644 python/aitemplate/compiler/ops/layernorm/__init__.py create mode 100644 python/aitemplate/compiler/ops/layernorm/batch_layernorm_sigmoid_mul.py create mode 100644 python/aitemplate/compiler/ops/layernorm/group_layernorm.py create mode 100644 python/aitemplate/compiler/ops/layernorm/group_layernorm_sigmoid_mul.py create mode 100644 python/aitemplate/compiler/ops/layernorm/layernorm.py create mode 100644 python/aitemplate/compiler/ops/layernorm/layernorm_sigmoid_mul.py create mode 100644 python/aitemplate/compiler/ops/padding/__init__.py create mode 100644 python/aitemplate/compiler/ops/padding/nhwc3to4.py create mode 100644 python/aitemplate/compiler/ops/padding/nhwc3to8.py create mode 100644 python/aitemplate/compiler/ops/padding/nhwc_pad_common.py create mode 100644 python/aitemplate/compiler/ops/padding/pad_last_dim.py create mode 100644 python/aitemplate/compiler/ops/pool/__init__.py create mode 100644 python/aitemplate/compiler/ops/pool/avg_pool2d.py create mode 100644 python/aitemplate/compiler/ops/pool/max_pool2d.py create mode 100644 python/aitemplate/compiler/ops/pool/pool2d.py create mode 100644 python/aitemplate/compiler/ops/reduce/__init__.py create mode 100644 python/aitemplate/compiler/ops/reduce/reduce_common.py create mode 100644 python/aitemplate/compiler/ops/reduce/reduce_mean.py create mode 100644 python/aitemplate/compiler/ops/reduce/reduce_sum.py create mode 100644 python/aitemplate/compiler/ops/reduce/var.py create mode 100644 python/aitemplate/compiler/ops/reduce/vector_norm.py create mode 100644 python/aitemplate/compiler/ops/softmax/__init__.py create mode 100644 python/aitemplate/compiler/ops/softmax/cache_entry.py create mode 100644 python/aitemplate/compiler/ops/softmax/softmax.py create mode 100644 python/aitemplate/compiler/ops/tensor/__init__.py create mode 100644 python/aitemplate/compiler/ops/tensor/argmax.py create mode 100644 python/aitemplate/compiler/ops/tensor/batch_gather.py create mode 100644 python/aitemplate/compiler/ops/tensor/chunk.py create mode 100644 python/aitemplate/compiler/ops/tensor/concatenate.py create mode 
100644 python/aitemplate/compiler/ops/tensor/concatenate_tanh.py create mode 100644 python/aitemplate/compiler/ops/tensor/dynamic_slice.py create mode 100644 python/aitemplate/compiler/ops/tensor/expand.py create mode 100644 python/aitemplate/compiler/ops/tensor/gather.py create mode 100644 python/aitemplate/compiler/ops/tensor/permute.py create mode 100644 python/aitemplate/compiler/ops/tensor/permute021.py create mode 100644 python/aitemplate/compiler/ops/tensor/permute102.py create mode 100644 python/aitemplate/compiler/ops/tensor/permute210.py create mode 100644 python/aitemplate/compiler/ops/tensor/size.py create mode 100644 python/aitemplate/compiler/ops/tensor/slice_reshape_scatter.py create mode 100644 python/aitemplate/compiler/ops/tensor/slice_scatter.py create mode 100644 python/aitemplate/compiler/ops/tensor/split.py create mode 100644 python/aitemplate/compiler/ops/tensor/topk.py create mode 100644 python/aitemplate/compiler/ops/upsample/__init__.py create mode 100644 python/aitemplate/compiler/ops/upsample/upsampling2d.py create mode 100644 python/aitemplate/compiler/ops/upsample/upsampling2d_add.py create mode 100644 python/aitemplate/compiler/ops/upsample/upsampling_common.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/__init__.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/nms/__init__.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/nms/batched_nms.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/nms/efficient_nms.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/nms/nms.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/roi_ops/__init__.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/roi_ops/multi_level_roi_align.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_align.py create mode 100644 python/aitemplate/compiler/ops/vision_ops/roi_ops/roi_ops.py create mode 100644 python/aitemplate/compiler/public/__init__.py create mode 100644 python/aitemplate/compiler/tensor_accessor.py create mode 100644 python/aitemplate/compiler/transform/__init__.py create mode 100644 python/aitemplate/compiler/transform/apply_padding.py create mode 100644 python/aitemplate/compiler/transform/bind_constants.py create mode 100644 python/aitemplate/compiler/transform/constant_folding.py create mode 100644 python/aitemplate/compiler/transform/fuse_conv_elementwise.py create mode 100644 python/aitemplate/compiler/transform/fuse_conv_patterns.py create mode 100644 python/aitemplate/compiler/transform/fuse_group_ops.py create mode 100644 python/aitemplate/compiler/transform/fuse_mm_elementwise.py create mode 100644 python/aitemplate/compiler/transform/fuse_mm_elementwise_patterns.py create mode 100644 python/aitemplate/compiler/transform/fuse_ops.py create mode 100644 python/aitemplate/compiler/transform/fuse_parallel_gemms.py create mode 100644 python/aitemplate/compiler/transform/fuse_permute_bmm.py create mode 100644 python/aitemplate/compiler/transform/fuse_split.py create mode 100644 python/aitemplate/compiler/transform/fuse_utils.py create mode 100644 python/aitemplate/compiler/transform/mark_param_tensor.py create mode 100644 python/aitemplate/compiler/transform/memory_planning.py create mode 100644 python/aitemplate/compiler/transform/name_graph.py create mode 100644 python/aitemplate/compiler/transform/optimize_graph.py create mode 100644 python/aitemplate/compiler/transform/profile.py create mode 100644 
python/aitemplate/compiler/transform/profile_dynamic_dim.py create mode 100644 python/aitemplate/compiler/transform/refine_graph.py create mode 100644 python/aitemplate/compiler/transform/remove_no_ops.py create mode 100644 python/aitemplate/compiler/transform/remove_unused_ops.py create mode 100644 python/aitemplate/compiler/transform/toposort.py create mode 100644 python/aitemplate/compiler/transform/transform_memory_ops.py create mode 100644 python/aitemplate/compiler/transform/transform_odd_alignment.py create mode 100644 python/aitemplate/compiler/transform/transform_special_ops.py create mode 100644 python/aitemplate/compiler/transform/transform_strided_op_and_view_op.py create mode 100644 python/aitemplate/compiler/transform/transform_strided_ops.py create mode 100644 python/aitemplate/compiler/transform/transform_strided_ops_utils.py create mode 100644 python/aitemplate/compiler/transform/transform_strided_slice.py create mode 100644 python/aitemplate/compiler/transform/transform_utils.py create mode 100644 python/aitemplate/frontend/__init__.py create mode 100644 python/aitemplate/frontend/nn/__init__.py create mode 100644 python/aitemplate/frontend/nn/attention.py create mode 100644 python/aitemplate/frontend/nn/container.py create mode 100644 python/aitemplate/frontend/nn/conv2d/__init__.py create mode 100644 python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_act.py create mode 100644 python/aitemplate/frontend/nn/conv2d/common_conv2d_bias_add_act.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_hardswish.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_add_relu.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_few_channels.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_hardswish_few_channels.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_relu_few_channels.py create mode 100644 python/aitemplate/frontend/nn/conv2d/conv2d_bias_sigmoid.py create mode 100644 python/aitemplate/frontend/nn/conv2d/special_conv2d_bias_act.py create mode 100644 python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias.py create mode 100644 python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_act.py create mode 100644 python/aitemplate/frontend/nn/conv2d/transposed_conv2d_bias_relu.py create mode 100644 python/aitemplate/frontend/nn/dropout.py create mode 100644 python/aitemplate/frontend/nn/embedding.py create mode 100644 python/aitemplate/frontend/nn/fpn_proposal.py create mode 100644 python/aitemplate/frontend/nn/group_norm.py create mode 100644 python/aitemplate/frontend/nn/identity.py create mode 100644 python/aitemplate/frontend/nn/layer_norm.py create mode 100644 python/aitemplate/frontend/nn/linear.py create mode 100644 python/aitemplate/frontend/nn/module.py create mode 100644 python/aitemplate/frontend/nn/padding.py create mode 100644 python/aitemplate/frontend/nn/parameter.py create mode 100644 python/aitemplate/frontend/nn/pool2d.py create mode 100644 python/aitemplate/frontend/nn/proposal.py create mode 100644 python/aitemplate/frontend/nn/roi_ops.py create mode 100644 python/aitemplate/frontend/nn/upsample.py create mode 100644 python/aitemplate/frontend/nn/view_ops.py 
create mode 100644 python/aitemplate/frontend/parameter.py create mode 100644 python/aitemplate/testing/__init__.py create mode 100644 python/aitemplate/testing/benchmark_ait.py create mode 100644 python/aitemplate/testing/benchmark_pt.py create mode 100644 python/aitemplate/testing/detect_target.py create mode 100644 python/aitemplate/testing/test_utils.py create mode 100644 python/aitemplate/utils/__init__.py create mode 100644 python/aitemplate/utils/graph_utils.py create mode 100644 python/aitemplate/utils/logger.py create mode 100644 python/aitemplate/utils/markdown_table.py create mode 100644 python/aitemplate/utils/mk_ck_lib/conv2d_operation.py create mode 100644 python/aitemplate/utils/mk_ck_lib/gemm_operation.py create mode 100644 python/aitemplate/utils/mk_ck_lib/generator.py create mode 100644 python/aitemplate/utils/mk_ck_lib/groupnorm_operation.py create mode 100644 python/aitemplate/utils/mk_ck_lib/layernorm_operation.py create mode 100644 python/aitemplate/utils/mk_ck_lib/library.py create mode 100644 python/aitemplate/utils/mk_ck_lib/manifest.py create mode 100644 python/aitemplate/utils/mk_ck_lib/softmax_operation.py create mode 100644 python/aitemplate/utils/mk_cutlass_lib/extra_conv_emit.py create mode 100644 python/aitemplate/utils/mk_cutlass_lib/extra_cutlass_generator.py create mode 100644 python/aitemplate/utils/mk_cutlass_lib/extra_enum.py create mode 100644 python/aitemplate/utils/mk_cutlass_lib/extra_gemm_emit.py create mode 100644 python/aitemplate/utils/mk_cutlass_lib/mk_cutlass_lib.py create mode 100644 python/aitemplate/utils/shape_utils.py create mode 100644 python/aitemplate/utils/tensor_utils.py create mode 100644 python/aitemplate/utils/torch_utils.py create mode 100644 python/aitemplate/utils/visualization/__init__.py create mode 100644 python/aitemplate/utils/visualization/op_attr_factory.py create mode 100644 python/aitemplate/utils/visualization/plot.py create mode 100644 python/aitemplate/utils/visualization/pydot.py create mode 100644 python/aitemplate/utils/visualization/web_template.py create mode 100644 python/setup.py create mode 100644 static/README.md create mode 100644 static/csrc/model_container.cpp create mode 100644 static/csrc/model_interface.cpp create mode 100644 static/csrc/rocm_hack.cpp create mode 100644 static/csrc/utility.cpp create mode 100644 static/include/cuda_device_functions.h create mode 100644 static/include/logging.h create mode 100644 static/include/macros.h create mode 100644 static/include/model_container.h create mode 100644 static/include/model_interface.h create mode 100644 static/include/owned_constants.h create mode 100644 static/include/raii_wrapper.h create mode 100644 static/include/rocm_device_functions.h create mode 100644 static/include/utility.h create mode 100644 tests/ci_profile_cache/README.md create mode 100644 tests/ci_profile_cache/update_cache.py create mode 100644 tests/lint/check_meta_header.py create mode 100644 tests/lint/flake8_problem_matcher.json create mode 100644 tests/unittest/backend/test_fused_elementwise_backend.py create mode 100644 tests/unittest/backend/test_model_api.py create mode 100644 tests/unittest/benchmark/test_group_gemm_benchmark.py create mode 100644 tests/unittest/benchmark/test_strided_layernorm_benchmark.py create mode 100644 tests/unittest/compiler/test_constant_folding.py create mode 100644 tests/unittest/compiler/test_fuse_conv_elementwise.py create mode 100644 tests/unittest/compiler/test_fuse_expand.py create mode 100644 
tests/unittest/compiler/test_fuse_mm_elementwise.py create mode 100644 tests/unittest/compiler/test_fuse_permute_bmm.py create mode 100644 tests/unittest/compiler/test_fused_elementwise_complex_dependency.py create mode 100644 tests/unittest/compiler/test_fused_elementwise_out_of_order.py create mode 100644 tests/unittest/compiler/test_group_fusions.py create mode 100644 tests/unittest/compiler/test_memory_planning.py create mode 100644 tests/unittest/compiler/test_pad_bmm_rrr_bias_with_cat.py create mode 100644 tests/unittest/compiler/test_pad_gemm_rrr_with_cat.py create mode 100644 tests/unittest/compiler/test_pad_gemm_with_cat.py create mode 100644 tests/unittest/compiler/test_pad_gemm_with_elementwise.py create mode 100644 tests/unittest/compiler/test_parallel_gemm_fusions.py create mode 100644 tests/unittest/compiler/test_permute_bmm_special_op.py create mode 100644 tests/unittest/compiler/test_public_import.py create mode 100644 tests/unittest/compiler/test_refine_graph.py create mode 100644 tests/unittest/compiler/test_remove_unused_ops.py create mode 100644 tests/unittest/compiler/test_slice_elemwise_fusion.py create mode 100644 tests/unittest/compiler/test_slice_gemm_fusion.py create mode 100644 tests/unittest/compiler/test_slice_reshape_scatter.py create mode 100644 tests/unittest/compiler/test_slice_scatter_pattern.py create mode 100644 tests/unittest/compiler/test_slice_view_strided.py create mode 100644 tests/unittest/compiler/test_split_bmm_fusion.py create mode 100644 tests/unittest/compiler/test_split_bmm_softmax_bmm.py create mode 100644 tests/unittest/compiler/test_split_view_strided.py create mode 100644 tests/unittest/compiler/test_strided_group_gemm.py create mode 100644 tests/unittest/compiler/test_strided_group_layernorm.py create mode 100644 tests/unittest/compiler/test_strided_layernorm.py create mode 100644 tests/unittest/compiler/test_strided_layernorm_reshape.py create mode 100644 tests/unittest/compiler/test_strided_op_cat_pattern.py create mode 100644 tests/unittest/compiler/test_strided_reshape_cat.py create mode 100644 tests/unittest/compiler/test_strided_scatter.py create mode 100644 tests/unittest/compiler/test_strided_split_group_gemm.py create mode 100644 tests/unittest/compiler/test_strided_view_cat.py create mode 100644 tests/unittest/compiler/test_strided_view_op.py create mode 100644 tests/unittest/compiler/test_tensor_accessor.py create mode 100644 tests/unittest/compiler/test_transform_memory_ops.py create mode 100644 tests/unittest/compiler/test_transform_odd_alignment.py create mode 100644 tests/unittest/compiler/test_transform_special_op.py create mode 100644 tests/unittest/compiler/test_transform_utils.py create mode 100644 tests/unittest/compiler/test_view_strided_op.py create mode 100644 tests/unittest/frontend/test_module.py create mode 100644 tests/unittest/ops/test_activation.py create mode 100644 tests/unittest/ops/test_argmax.py create mode 100644 tests/unittest/ops/test_attention.py create mode 100644 tests/unittest/ops/test_avg_pool2d.py create mode 100644 tests/unittest/ops/test_batch_gather.py create mode 100644 tests/unittest/ops/test_bert_embeddings.py create mode 100644 tests/unittest/ops/test_bmm.py create mode 100644 tests/unittest/ops/test_bmm_add.py create mode 100644 tests/unittest/ops/test_bmm_alpha.py create mode 100644 tests/unittest/ops/test_bmm_permute.py create mode 100644 tests/unittest/ops/test_bmm_rcr_n1.py create mode 100644 tests/unittest/ops/test_bmm_rrr_k1_tanh.py create mode 100644 
tests/unittest/ops/test_bmm_softmax.py create mode 100644 tests/unittest/ops/test_bmm_softmax_bmm.py create mode 100644 tests/unittest/ops/test_chunk.py create mode 100644 tests/unittest/ops/test_clamp_nan_to_num.py create mode 100644 tests/unittest/ops/test_concatenate.py create mode 100644 tests/unittest/ops/test_concatenate_tanh.py create mode 100644 tests/unittest/ops/test_conv.py create mode 100644 tests/unittest/ops/test_conv2d_bias_add.py create mode 100644 tests/unittest/ops/test_conv_bias.py create mode 100644 tests/unittest/ops/test_conv_bias_act_few_channels.py create mode 100644 tests/unittest/ops/test_conv_bias_add_hardswish.py create mode 100644 tests/unittest/ops/test_conv_bias_add_relu.py create mode 100644 tests/unittest/ops/test_conv_bias_hardswish.py create mode 100644 tests/unittest/ops/test_conv_bias_relu.py create mode 100644 tests/unittest/ops/test_conv_bias_sigmoid.py create mode 100644 tests/unittest/ops/test_dynamic_conv.py create mode 100644 tests/unittest/ops/test_efficient_nms.py create mode 100644 tests/unittest/ops/test_expand.py create mode 100644 tests/unittest/ops/test_flatten.py create mode 100644 tests/unittest/ops/test_fpn_roi_align.py create mode 100644 tests/unittest/ops/test_fused_elementwise.py create mode 100644 tests/unittest/ops/test_fused_elementwise_broadcast.py create mode 100644 tests/unittest/ops/test_fused_elementwise_with_strided_outputs.py create mode 100644 tests/unittest/ops/test_gather.py create mode 100644 tests/unittest/ops/test_gemm.py create mode 100644 tests/unittest/ops/test_gemm_bias.py create mode 100644 tests/unittest/ops/test_gemm_bias_broadcast.py create mode 100644 tests/unittest/ops/test_gemm_bias_hardswish.py create mode 100644 tests/unittest/ops/test_gemm_bias_permute.py create mode 100644 tests/unittest/ops/test_gemm_bias_relu.py create mode 100644 tests/unittest/ops/test_gemm_bias_sigmoid.py create mode 100644 tests/unittest/ops/test_gemm_bias_softmax.py create mode 100644 tests/unittest/ops/test_gemm_bias_swish.py create mode 100644 tests/unittest/ops/test_gemm_bias_tanh.py create mode 100644 tests/unittest/ops/test_gemm_permute.py create mode 100644 tests/unittest/ops/test_gemm_rcr_bias_fast_gelu.py create mode 100644 tests/unittest/ops/test_gemm_rrr_small_nk.py create mode 100644 tests/unittest/ops/test_gemm_softmax.py create mode 100644 tests/unittest/ops/test_group_gemm_rcr.py create mode 100644 tests/unittest/ops/test_group_gemm_rcr_bias.py create mode 100644 tests/unittest/ops/test_group_gemm_rcr_bias_activation.py create mode 100644 tests/unittest/ops/test_group_gemm_rcr_bias_cat.py create mode 100644 tests/unittest/ops/test_group_gemm_rcr_cat.py create mode 100644 tests/unittest/ops/test_groupnorm.py create mode 100644 tests/unittest/ops/test_layernorm.py create mode 100644 tests/unittest/ops/test_layernorm_sigmoid_mul.py create mode 100644 tests/unittest/ops/test_max_pool2d.py create mode 100644 tests/unittest/ops/test_nhwc3to4.py create mode 100644 tests/unittest/ops/test_nhwc3to8.py create mode 100644 tests/unittest/ops/test_nms.py create mode 100644 tests/unittest/ops/test_norm.py create mode 100644 tests/unittest/ops/test_pad_last_dim.py create mode 100644 tests/unittest/ops/test_perm021fc_ccr.py create mode 100644 tests/unittest/ops/test_perm021fc_ccr_bias.py create mode 100644 tests/unittest/ops/test_perm021fc_ccr_bias_perm021.py create mode 100644 tests/unittest/ops/test_perm021fc_crc.py create mode 100644 tests/unittest/ops/test_perm021fc_crc_bias.py create mode 100644 
tests/unittest/ops/test_perm102_bmm_rcr.py create mode 100644 tests/unittest/ops/test_perm102_bmm_rrr.py create mode 100644 tests/unittest/ops/test_permute.py create mode 100644 tests/unittest/ops/test_permute021.py create mode 100644 tests/unittest/ops/test_permute102.py create mode 100644 tests/unittest/ops/test_permute210.py create mode 100644 tests/unittest/ops/test_proposal.py create mode 100644 tests/unittest/ops/test_reduce.py create mode 100644 tests/unittest/ops/test_reshape.py create mode 100644 tests/unittest/ops/test_roi_align.py create mode 100644 tests/unittest/ops/test_size_getitem_ops.py create mode 100644 tests/unittest/ops/test_slice.py create mode 100644 tests/unittest/ops/test_softmax.py create mode 100644 tests/unittest/ops/test_split.py create mode 100644 tests/unittest/ops/test_split_getitem.py create mode 100644 tests/unittest/ops/test_squeeze.py create mode 100644 tests/unittest/ops/test_topk.py create mode 100644 tests/unittest/ops/test_transpose_conv2d.py create mode 100644 tests/unittest/ops/test_transpose_conv2d_bias.py create mode 100644 tests/unittest/ops/test_transpose_conv2d_bias_relu.py create mode 100644 tests/unittest/ops/test_tuple_list_construct.py create mode 100644 tests/unittest/ops/test_upsamping2d.py create mode 100644 tests/unittest/ops/test_upsamping2d_add.py create mode 100644 tests/unittest/ops/test_var.py
diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 000000000..19c2d377a
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,80 @@
+# Use the latest 2.1 version of CircleCI pipeline process engine.
+# See: https://circleci.com/docs/2.0/configuration-reference
+version: 2.1
+
+# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects.
+# See: https://circleci.com/docs/2.0/orb-intro/
+orbs:
+  # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files
+  # Orb commands and jobs help you with common scripting around a language/tool
+  # so you don't have to copy and paste it everywhere.
+  # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
+  python: circleci/python@1.5.0
+
+# -------------------------------------------------------------------------------------
+# Re-usable commands
+# -------------------------------------------------------------------------------------
+setup_env: &setup_env
+  - run:
+      name: Setup environment
+      command: |
+        python3.8 --version
+        python3.8 -m pip install --upgrade pip
+        cd python
+        python3.8 setup.py bdist_wheel
+        sudo python3.8 -m pip install --no-input dist/*.whl
+        cd ..
+        python3.8 -m pip install pytest
+        python3.8 -m pip install torch
+        python3.8 -m pip install numpy
+        python3.8 -m pip install jinja2
+        python3.8 -m pip install recordtype
+        python3.8 -m pip install parameterized
+        python3.8 -m pip install einops
+        git submodule sync
+        git submodule update --init
+        echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV
+        echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV
+        echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV
+        echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV
+
+basic_tests: &basic_tests
+  - run:
+      name: Run tests
+      command: |
+        set -e
+        TEST_FILES=$(circleci tests glob "tests/unittest/**/test_*.py" | grep -v benchmark | circleci tests split --split-by=timings)
+        mkdir test-results
+        python3.8 -m pytest $TEST_FILES --junitxml=test-results/junit.xml --verbose --continue-on-collection-errors -rA
+
+
+# Define a job to be invoked later in a workflow.
+# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
+jobs:
+  build-and-test:
+    machine:
+      image: ubuntu-2004-cuda-11.4:202110-01
+    # Check T101565170 for multi-gpu use cases.
+    resource_class: gpu.nvidia.medium
+
+    parallelism: 10
+
+    # Checkout the code as the first step. This is a dedicated CircleCI step.
+    # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
+    # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt.
+    # Then run your tests!
+    # CircleCI will report the results back to your VCS provider.
+    steps:
+      - checkout
+      - <<: *setup_env
+      - <<: *basic_tests
+      - store_test_results:
+          path: test-results
+
+# Invoke jobs via workflows
+# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
+workflows:
+  unittest: # This is the name of the workflow, feel free to change it to better match your workflow.
+    # Inside the workflow, you define the jobs you want to run.
+ jobs: + - build-and-test diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..73304266b --- /dev/null +++ b/.clang-format @@ -0,0 +1,88 @@ +--- +AccessModifierOffset: -1 +AlignAfterOpenBracket: AlwaysBreak +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ] +IncludeCategories: + - Regex: '^<.*\.h(pp)?>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 2000000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..3f29e8612 --- /dev/null +++ b/.flake8 @@ -0,0 +1,63 @@ +[flake8] +select = B,C,E,F,P,W,B9 +max-line-length = 80 +# Main Explanation Docs: https://github.com/grantmcconnaughey/Flake8Rules +ignore = + # Black conflicts and overlaps. + # Found in https://github.com/psf/black/issues/429 + B950, # Line too long. + E111, # Indentation is not a multiple of four. + E115, # Expected an indented block (comment). + E117, # Over-indented. + E121, # Continuation line under-indented for hanging indent. + E122, # Continuation line missing indentation or outdented. + E123, # Closing bracket does not match indentation of opening bracket's line. + E124, # Closing bracket does not match visual indentation. + E125, # Continuation line with same indent as next logical line. 
+ E126, # Continuation line over-indented for hanging indent. + E127, # Continuation line over-indented for visual indent. + E128, # Continuation line under-indented for visual indent. + E129, # Visually indented line with same indent as next logical line. + E131, # Continuation line unaligned for hanging indent. + E201, # Whitespace after '('. + E202, # Whitespace before ')'. + E203, # Whitespace before ':'. + E221, # Multiple spaces before operator. + E222, # Multiple spaces after operator. + E225, # Missing whitespace around operator. + E226, # Missing whitespace around arithmetic operator. + E227, # Missing whitespace around bitwise or shift operator. + E231, # Missing whitespace after ',', ';', or ':'. + E241, # Multiple spaces after ','. + E251, # Unexpected spaces around keyword / parameter equals. + E252, # Missing whitespace around parameter equals. + E261, # At least two spaces before inline comment. + E262, # Inline comment should start with '# '. + E265, # Block comment should start with '# '. + E271, # Multiple spaces after keyword. + E272, # Multiple spaces before keyword. + E301, # Expected 1 blank line, found 0. + E302, # Expected 2 blank lines, found 0. + E303, # Too many blank lines (3). + E305, # Expected 2 blank lines after end of function or class. + E306, # Expected 1 blank line before a nested definition. + E501, # Line too long (82 > 79 characters). + E502, # The backslash is redundant between brackets. + E701, # Multiple statements on one line (colon). + E702, # Multiple statements on one line (semicolon). + E703, # Statement ends with a semicolon. + E704, # Multiple statements on one line (def). + W291, # Trailing whitespace. + W292, # No newline at end of file. + W293, # Blank line contains whitespace. + W391, # Blank line at end of file. + W504, # Line break occurred after a binary operator. + + # Too opinionated. + E265, # Block comment should start with '# '. + E266, # Too many leading '#' for block comment. + E402, # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) + E722, # Do not use bare except, specify exception instead. (Duplicate of B001) + P207, # (Duplicate of B003) + P208, # (Duplicate of C403) + W503 # Line break occurred before a binary operator. 
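For contributors who want to run these checks locally before pushing, the following is a minimal sketch that mirrors the lint workflow added below; it assumes Python with pip is available and that the commands are run from the repository root (package versions are left unpinned here):

    pip install flake8 ufmt click
    flake8 .                                                               # style checks against the .flake8 configuration above
    ufmt diff python                                                       # formatting diffs for the Python sources
    ufmt diff tests
    ufmt diff docs
    python tests/lint/check_meta_header.py --path=./python --fixit=False   # copyright header check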
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 000000000..208bd1f77 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,67 @@ +# Simple workflow for deploying static content to GitHub Pages +name: Documentation + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9"] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install autodocsumm + pip install sphinx_rtd_theme + pip install sphinx_gallery + pip install sphinxcontrib-inlinesyntaxhighlight + pip install sphinx_toolbox + pip install numpy + pip install jinja2 + pip install torch + cd python + python setup.py develop + cd .. + - name: Build documents with Sphinx + run: | + cd docs + BUILD_DOCS=1 make html + cd .. + - name: Setup Pages + uses: actions/configure-pages@v2 + - name: Upload artifact + uses: actions/upload-pages-artifact@v1 + with: + path: './docs/build/html' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v1 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..dbd4beb83 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,41 @@ +name: Lint + +on: + push: + branches: + - main + + pull_request: + branches: + - main +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ufmt + pip install click + pip install flake8 + - name: Analyzing the code with flake8 + run: | + echo "::add-matcher::tests/lint/flake8_problem_matcher.json" + flake8 . + - name: Analyzing the code with ufmt + run: | + ufmt diff python + ufmt diff tests + ufmt diff docs + - name: Check Meta copyright header + run: | + python tests/lint/check_meta_header.py --path=./tests --fixit=False + python tests/lint/check_meta_header.py --path=./python --fixit=False \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..f3bbc0889 --- /dev/null +++ b/.gitignore @@ -0,0 +1,143 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# tmp +tmp/ + +tags + +# macOS dir files +.DS_Store + +# vscode +.vscode + +# vim temp files +*.swp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..2aeb63ba5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,10 @@ +[submodule "3rdparty/cutlass"] + path = 3rdparty/cutlass + url = https://github.com/NVIDIA/cutlass.git +[submodule "3rdparty/cub"] + path = 3rdparty/cub + url = https://github.com/NVIDIA/cub.git +[submodule "3rdparty/composable_kernel"] + path = 3rdparty/composable_kernel + url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git + branch = develop diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel new file mode 160000 index 000000000..b88255475 --- /dev/null +++ b/3rdparty/composable_kernel @@ -0,0 +1 @@ +Subproject commit b8825547586855ec730a2eca47e415b1404bb5f2 diff --git a/3rdparty/cub b/3rdparty/cub new file mode 160000 index 000000000..dcd5b06a4 --- /dev/null +++ b/3rdparty/cub @@ -0,0 +1 @@ +Subproject commit dcd5b06a417bdfdc2699678bddf7dd7ee38be466 diff --git a/3rdparty/cutlass b/3rdparty/cutlass new file mode 160000 index 000000000..dadc881a9 --- /dev/null +++ b/3rdparty/cutlass @@ -0,0 +1 @@ +Subproject commit dadc881a9606f95cba1b20acda03c9d07c286239 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..810680431 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,54 @@ +cff-version: 1.2.0 +title: AITemplate +message: >- + If you use this software, please cite using the + following metadata. 
+type: software +authors: + - given-names: Bing + family-names: Xu + affiliation: Meta + email: bingxu@meta.com + - given-names: Ying + family-names: Zhang + affiliation: Meta + email: yingz@meta.com + - given-names: Hao + family-names: Lu + affiliation: Meta + email: hlu@meta.com + - given-names: Yang + family-names: Chen + affiliation: Meta + email: yangche@meta.com + - given-names: Terry + family-names: Chen + affiliation: Meta + email: terrychen@meta.com + - given-names: Mike + family-names: Iovine + affiliation: Meta + email: mikeiovine@meta.com + - given-names: Mu-Chu + family-names: Lee + affiliation: Meta + email: mlee8@meta.com + - given-names: Zhijing + family-names: Li + affiliation: Meta + email: tissue030@meta.com + + +repository-code: 'https://github.com/facebookincubator/AITemplate' +abstract: >- + AITemplate (AIT) is a unified inference framework with separate acceleration backends for both AMD and NVIDIA GPU hardware. It delivers close to hardware-native Tensor Core (NVIDIA GPU) and Matrix Core (AMD GPU) performance on a variety of widely used AI models such as convolutional neural networks, transformers, and diffusers. +keywords: + - 'neural network, cutlass, composable kernel, cuda, rocm' +license: Apache 2.0 +license-url: https://github.com/facebookincubator/AITemplate/LICENSE +version: '0.1' +date-released: '2022-10-03' +identifiers: + - type: url + value: "https://github.com/facebookincubator/AITemplate/tree/v0.1.0" + description: The GitHub release URL of tag 0.1.0 \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..08b500a22 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..fde4225fa --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,37 @@ +# Contributing to AITemplate +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process +1. For major change, submit RFC to discuss the change. +2. For feature extension, submit PR with tests and documentation. +3. For bug fix, submit PR with tests and documentation. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. 
+ + +## License +By contributing to AITemplate, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..b09cd7856 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..833b6cebc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,119 @@
+# AITemplate
+
+[![License](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE) |
+[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yml/badge.svg)](https://facebookincubator.github.io/AITemplate) |
+[![CircleCI](https://circleci.com/gh/facebookincubator/AITemplate.svg?style=svg)](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate)
+
+
+
+AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include:
+
+- High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc.
+- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, with Lego-style, easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
+
+## More about AITemplate
+
+### Excellent Backwards Compatibility
+
+AITemplate doesn't depend on third-party libraries or runtimes such as cuBLAS, cuDNN, rocBLAS, MIOpen, TensorRT, or MIGraphX. Each model is compiled into a self-contained portable binary, which can be used in any software environment with the same hardware.
+
+### Horizontal Fusion
+
+AITemplate provides unique advanced horizontal fusion. AITemplate can fuse parallel GEMM, LayerNorm, and other operators with different input shapes into a single GPU kernel.
+
+### Vertical Fusion
+
+AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reductions, and layout permutations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion.
+
+### Memory Fusion
+
+AITemplate provides innovative memory fusions. AITemplate can fuse GEMM, LayerNorm, and other operators with subsequent memory operations such as concatenation, split, and slice into a single operator.
+
+### Working w/wo PyTorch
+The AITemplate-generated Python runtime can take PyTorch tensors as inputs and outputs without an extra copy. For environments without PyTorch, the AITemplate Python/C++ runtime is self-contained.
+
+### Extensions without suffering
+
+AITemplate provides a straightforward approach for making an extension in codegen. To add a new operator or a new fused kernel into AITemplate, most of the time one only needs to add two Python files: one for a graph node definition and another for the backend codegen. The CUDA/HIP kernel in a text header file can be used directly in the codegen.
+
+## Installation
+
+**Hardware requirements:**
+ - **NVIDIA**: AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
+ - **AMD**: AIT is only tested on CDNA2 (MI-210/250) GPUs. There may be compiler issues for old CDNA1 (MI-100) GPUs.
+
+### Docker Image
+We highly recommend using AITemplate with Docker to avoid accidentally using a wrong version of NVCC or HIPCC.
+- CUDA: `./docker/build.sh cuda`
+- ROCM: `DOCKER_BUILDKIT=1 ./docker/build.sh rocm`
+
+This will build a Docker image with the tag `ait:latest`.
+
+### From Source
+The following command will create a Python wheel for AITemplate. Please ensure you have the correct CUDA/ROCm compiler installed.
+- CUDA: CUDA 11.6
+- ROCm: We tested on ROCm 5.2.3 with a customized build of HIPCC, using the command in docker/Dockerfile.rocm#L87-L96
+
+*An incorrect compiler will lead to performance regression.*
+
+```
+cd python
+python setup.py bdist_wheel
+pip install dist/*.whl
+```
+
+## Getting Started
+
+Check out the [AITemplate Documentation](https://facebookincubator.github.io/AITemplate) for the API reference.
+
+There are a few tutorials for onboarding:
+
+- 01: [How to run inference on a PyTorch model with AIT](https://facebookincubator.github.io/AITemplate/tutorial/how_to_infer_pt.html)
+- 02: [How to add an op to AIT codegen](https://facebookincubator.github.io/AITemplate/tutorial/how_to_add_op.html)
+- 03: [How to visualize AIT's optimization](https://facebookincubator.github.io/AITemplate/tutorial/how_to_visualize.html)
+
+A minimal, illustrative runtime sketch is also included near the end of this README.
+
+
+## Examples & Performance
+AITemplate provides the following model templates & reference performance data on A100/MI-250:
+
+- [01_ResNet-50](examples/01_resnet-50/) with PyTorch Image Models (TIMM)
+- [02_MaskRCNN-FPN](examples/02_detectron2/) with Detectron2
+- [03_BERT](examples/03_bert/) with HuggingFace Transformers
+- [04_Vision Transformer](examples/04_vit/) with PyTorch Image Models (TIMM)
+- [05_Stable Diffusion](examples/05_stable_diffusion/) with HuggingFace Diffusers
+
+## Release
+
+AITemplate has a 90-day release cycle.
+In the next one or two releases, we will focus on:
+- Deprecating FlashAttention: Unify CUDA attention computation to Composable Kernel (AMD GPU) style back-to-back fusion to improve performance and increase flexibility for NVIDIA GPU Transformer users.
+- Remove the kernel profiling requirement.
+- GEMM + LayerNorm fusion, GEMM + GEMM fusion, Conv + Conv fusion.
+- Better dynamic shape support: Focus on dynamic sequence lengths in Transformers.
+- More model templates: Provide model templates with control flow and containers.
+- More automatic graph passes: Reduce the need to manually rewrite models to obtain the best performance.
+- Enable more fusions on the AMD backend.
+
+Some ongoing/potential work that won't appear in the next short-term release:
+- Automatic PyTorch FX, ONNX, Open-XLA, and other model format conversion.
+- Quantized model (int8/fp8/int4) support.
+- Composable Kernel CPU extension on AVX2/AVX-512 for AMD EPYC CPUs.
+
+## Contributing
+Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute to the project.
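+
+## Minimal Runtime Sketch
+
+To complement the tutorials above, the snippet below is a minimal, illustrative sketch of calling an already-compiled AIT module from Python with PyTorch tensors. It is based on the `AITData` / `torch_to_ait_data` / `Model.run` APIs described in `docs/source/runtime/py_design.rst`; the import path of `torch_to_ait_data`, the input/output names, and the shapes here are assumptions for illustration only, not a definitive reference.
+
+```python
+import torch
+
+# Assumed import path; the Model bindings and the torch_to_ait_data helper
+# are documented in compiler/model.py (see the Python runtime note in docs).
+from aitemplate.compiler.model import torch_to_ait_data
+
+
+def run_with_torch_tensors(module):
+    """Run a compiled AIT module on PyTorch GPU tensors without extra copies.
+
+    `module` is assumed to be an already-compiled AIT Model whose graph has
+    one input named "input0" and one output named "output0" of shape
+    (1, 64, 64, 3); adjust the names and shapes to your own model.
+    """
+    # Allocate fp16 GPU tensors with PyTorch; AIT reads/writes their memory directly.
+    x = torch.randn(1, 64, 64, 3).half().cuda()   # input
+    y = torch.empty(1, 64, 64, 3).half().cuda()   # pre-allocated output
+
+    # Wrap them as AITData (unowned pointer + shape + dtype) and run by name.
+    module.run(
+        {"input0": torch_to_ait_data(x)},
+        {"output0": torch_to_ait_data(y)},
+    )
+    return y  # `y` now holds the result and can be used directly in PyTorch.
+```
+
+See the tutorials above for the authoritative end-to-end flow, including how to compile `module` in the first place.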
+
+## The Team
+
+AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions from many more talented engineers, including (non-exhaustively) Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, and Max Podkorytov. We also want to thank Andrew Tulloch, Yinghai Lu, and Lu Fang for valuable discussions.
+
+AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine), [Mu-Chu Lee](https://github.com/muchulee8), and [Bing Xu](https://github.com/antinucleon).
+
+
+## Acknowledgement
+
+The AITemplate team works closely with the NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) team (led by Andrew Kerr and Haicheng Wu) and the AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) team (led by Chao Liu and Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform; none of this would have been possible without our close collaboration.
+
+
+## License
+AITemplate is licensed under the [Apache 2.0 License](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE).
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
new file mode 100644
index 000000000..0461f45bf
--- /dev/null
+++ b/docker/Dockerfile.cuda
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +# CUDA Docker Image for AITemplate + +FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 + +# Base scripts +RUN apt-get update --fix-missing +RUN apt install -y python3 python3-dev python3-pip + +# Environment variables +ENV PATH=/usr/local/nvidia/bin:${PATH} +ENV PATH=/usr/local/cuda/bin:${PATH} +ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} + +ADD ./docker/install/ /Install +# necessary package +RUN bash /Install/install_basic_dep.sh + +# for test +RUN bash /Install/install_test_dep.sh + +# for docs +RUN bash /Install/install_doc_dep.sh + + +# install Pytorch +RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 + +# for detection +RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata +RUN bash /Install/install_detection_deps.sh + +# Copy AITemplate to Docker +RUN mkdir /AITemplate +ADD ./COMMIT_INFO /AITemplate/COMMIT_INFO +ADD ./python /AITemplate/python +ADD ./3rdparty /AITemplate/3rdparty +ADD ./examples /AITemplate/examples +ADD ./tests /AITemplate/tests +ADD ./docs /AITemplate/docs +ADD ./static /AITemplate/static +ADD ./licenses /AITemplate/licenses +ADD ./docker/install/install_ait.sh /AITemplate/ +RUN bash /AITemplate/install_ait.sh diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm new file mode 100644 index 000000000..991bc3095 --- /dev/null +++ b/docker/Dockerfile.rocm @@ -0,0 +1,147 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# ROCM Docker Image for AITemplate +FROM ubuntu:20.04 + +ARG ROCMVERSION=5.2.3 +ARG compiler_version=b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb + + +RUN set -xe + +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +# Add rocm repository +RUN apt-get update +RUN apt-get install -y wget gnupg +RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" + +# Install dependencies +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + build-essential \ + cmake-data \ + cmake \ + curl \ + git \ + hip-rocclr \ + jq \ + libelf-dev \ + libncurses5-dev \ + libnuma-dev \ + libpthread-stubs0-dev \ + llvm-amdgpu \ + pkg-config \ + python \ + python3 \ + python-dev \ + python3-dev \ + python3-pip \ + software-properties-common \ + rocm-dev \ + rocm-device-libs \ + rocm-cmake \ + rocm-libs \ + vim \ + zlib1g-dev \ + openssh-server \ + clang-format-10 \ + kmod && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Setup ubsan environment to printstacktrace +RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer +ENV UBSAN_OPTIONS=print_stacktrace=1 + +# Install an init system +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb +RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb + +ARG PREFIX=/opt/rocm +# Install packages for processing the performance results +RUN pip3 install --upgrade pip +RUN pip3 install sqlalchemy +RUN pip3 install pymysql +RUN pip3 install pandas +RUN pip3 install setuptools-rust +RUN pip3 install sshtunnel +# Setup ubsan environment to printstacktrace +ENV UBSAN_OPTIONS=print_stacktrace=1 + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +ADD ./docker/install/rocm_dev-requirements.txt dev-requirements.txt +RUN groupadd -f render + +# Install the new rocm-cmake version +RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . 
--target install + +WORKDIR / + +ENV compiler_version=$compiler_version +RUN sh -c "echo compiler version = '$compiler_version'" + +RUN --mount=type=ssh if [ "$compiler_version" != "release" ]; then \ + git clone https://github.com/RadeonOpenCompute/llvm-project.git && \ + cd llvm-project && \ + git checkout "$compiler_version" && \ + mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ + make -j 96 ; \ + else echo "using the release compiler"; \ + fi + +ENV HIP_CLANG_PATH='/llvm-project/build/bin' +RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" + +# Fix compiler bug in 10736 +ADD ./docker/rocm_fix /rocm_fix +RUN python3 /rocm_fix/fix_10736.py + +ADD ./docker/install/ /Install +# necessary package +RUN bash /Install/install_basic_dep.sh + +# for test +RUN bash /Install/install_test_dep.sh + +# for docs +RUN bash /Install/install_doc_dep.sh + +# Install Pytorch +RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 + +# for detection +RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata +RUN bash /Install/install_detection_deps.sh + + +# Copy AITemplate to Docker +RUN mkdir /AITemplate +ADD ./COMMIT_INFO /AITemplate/COMMIT_INFO +ADD ./python /AITemplate/python +ADD ./3rdparty /AITemplate/3rdparty +ADD ./examples /AITemplate/examples +ADD ./tests /AITemplate/tests +ADD ./docs /AITemplate/docs +ADD ./static /AITemplate/static +ADD ./licenses /AITemplate/licenses +ADD ./docker/install/install_ait.sh /AITemplate/ +RUN bash /AITemplate/install_ait.sh diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 000000000..dea4b35b9 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,30 @@ +# Docker + AITemplate + +AITemplate provides a Docker image with all test, benchmark, and documentation dependencies installed. + +## Build CUDA Docker Image + +```bash docker/build.sh cuda``` +This will build a CUDA 11 docker image with tag: `ait:latest` + +## Build ROCM Docker Image + +```DOCKER_BUILDKIT=1 bash docker/build.sh rocm``` +This will build a RCOM 5 docker image with tag: `ait:latest` + +## Running Unit Tests in Docker + +```docker run --gpus all ait:latest bash /AITemplate/tests/nightly/unittest.sh``` + +## Launching CUDA Docker +```docker run --gpus all -it ait:latest``` + +## Launching ROCM Docker + +```docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ait:latest``` + + +## Common questions: +- Q: When building ROCm Docker, I hit this error ` => ERROR [internal] load metadata for docker.io/library/ubuntu:20.04`, what shall I do? 
+ + A: Run `docker pull docker.io/library/ubuntu:20.04` to pull base image manually, then re-run `./docker/build.sh rocm` diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 000000000..37c3612b1 --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +TARGET=$1 +COMMIT=$(git show --format="%H" --no-patch) +COMMIT_AUTHOR=$(git show --format="%an" --no-patch) +COMMIT_TIME=$(git show --format="%cI" --no-patch) +echo "$COMMIT" > COMMIT_INFO +echo "$COMMIT_AUTHOR" >> COMMIT_INFO +echo "$COMMIT_TIME" >> COMMIT_INFO + +if [ "$TARGET" = "cuda" ]; then + if [ "$2" = "debug" ]; then + echo "Build in DEBUG mode with git files" + echo "RUN apt install -y vim git" >> ./docker/Dockerfile.cuda + echo "ADD .git /AITemplate/.git" >> ./docker/Dockerfile.cuda + fi + echo "Building CUDA Docker Image with tag ait:latest" + docker build -f ./docker/Dockerfile.cuda -t ait . +elif [ "$TARGET" = "rocm" ]; then + echo "Building ROCM Docker Image with tag ait:latest" + docker build -f ./docker/Dockerfile.rocm -t ait . +else + echo "Unknown target" +fi diff --git a/docker/install/install_ait.sh b/docker/install/install_ait.sh new file mode 100644 index 000000000..3b1fdf6f3 --- /dev/null +++ b/docker/install/install_ait.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd /AITemplate/python +python3 setup.py bdist_wheel +pip3 install --no-input /AITemplate/python/dist/*.whl diff --git a/docker/install/install_basic_dep.sh b/docker/install/install_basic_dep.sh new file mode 100644 index 000000000..801ef53ef --- /dev/null +++ b/docker/install/install_basic_dep.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +pip3 install numpy +pip3 install jinja2 diff --git a/docker/install/install_detection_deps.sh b/docker/install/install_detection_deps.sh new file mode 100644 index 000000000..47238cd3c --- /dev/null +++ b/docker/install/install_detection_deps.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +apt install -y ffmpeg libsm6 libxext6 wget +pip3 install yacs +pip3 install opencv-python +pip3 install tqdm +pip3 install timm +pip3 install transformers +pip3 install diffusers diff --git a/docker/install/install_doc_dep.sh b/docker/install/install_doc_dep.sh new file mode 100644 index 000000000..350738142 --- /dev/null +++ b/docker/install/install_doc_dep.sh @@ -0,0 +1,6 @@ +#! 
/bin/bash +pip3 install autodocsumm +pip3 install sphinx_rtd_theme +pip3 install sphinx_gallery +pip3 install sphinxcontrib-inlinesyntaxhighlight +pip3 install sphinx_toolbox diff --git a/docker/install/install_test_dep.sh b/docker/install/install_test_dep.sh new file mode 100644 index 000000000..6cc7c1b44 --- /dev/null +++ b/docker/install/install_test_dep.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +pip3 install click +pip3 install pytest +pip3 install parameterized +pip3 install pylint==2.13.9 +pip3 install ufmt +pip3 install pyGithub +pip3 install gitpython +pip3 install xmltodict +pip3 install einops diff --git a/docker/install/rocm_dev-requirements.txt b/docker/install/rocm_dev-requirements.txt new file mode 100644 index 000000000..3c8cbd155 --- /dev/null +++ b/docker/install/rocm_dev-requirements.txt @@ -0,0 +1,3 @@ +ROCmSoftwarePlatform/rocm-recipes +# 1.90+ +danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f diff --git a/docker/rocm_fix/fix_10736.py b/docker/rocm_fix/fix_10736.py new file mode 100644 index 000000000..c91e7f200 --- /dev/null +++ b/docker/rocm_fix/fix_10736.py @@ -0,0 +1,9 @@ +src = "" +with open("/opt/rocm/hip/bin/hipcc.pl", "r") as fi: + src = fi.read() + +src = src.replace( + "$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);", "chomp($HIP_CLANG_TARGET);" +) +with open("/opt/rocm/hip/bin/hipcc.pl", "w") as fo: + fo.write(src) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..7f6e76eb5 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,22 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + cp static/ait_model.html build/html/tutorial/ait_model.html + diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..1a07a9b4c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,20 @@ +# AITemplate Documentation + + +## Build locally + +1. Install AITemplate + +2. Install Sphinx +``` +pip install autodocsumm +pip install sphinx_rtd_theme +pip install sphinx_gallery +pip install sphinxcontrib-inlinesyntaxhighlight +pip install sphinx_toolbox +``` + +3. Build HTML +``` +make html +``` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..747ffb7b3 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/arch/index.rst b/docs/source/arch/index.rst new file mode 100644 index 000000000..13fdf207f --- /dev/null +++ b/docs/source/arch/index.rst @@ -0,0 +1,12 @@ +Design and Architecture +======================= + + +.. toctree:: + :maxdepth: 1 + + philosophy + + + +Stay tuned for more... diff --git a/docs/source/arch/philosophy.rst b/docs/source/arch/philosophy.rst new file mode 100644 index 000000000..2eefb8f5d --- /dev/null +++ b/docs/source/arch/philosophy.rst @@ -0,0 +1,16 @@ +Design Philosophy +================== + + +KISS (Keep it simple and stupid) +-------------------------------- + +AITemplate avoids deep IR lowering stacks to reduce the system's complexity. A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference. + +Pragmatism +---------- + +AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN. + + +We believe most of the neural network workload can be decoupled. For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. For encoder and decoder, it is a computation bounded problem. For decoder logic, it may involve more control flows. By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack to play as the silver bullet. \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..bf239d5d1 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,67 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "AITemplate" +copyright = "2022, Meta Platforms" +author = "Meta Platforms" + +# The full version, including alpha/beta/rc tags +release = "0.1" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "sphinx.ext.napoleon", + "sphinx.ext.autodoc", + "sphinx.ext.mathjax", + "autodocsumm", + "sphinxcontrib.inlinesyntaxhighlight", + "sphinx_toolbox.code", +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'alabaster'
+html_theme = "sphinx_rtd_theme"
+
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/docs/source/debughints.rst b/docs/source/debughints.rst
new file mode 100644
index 000000000..074254a75
--- /dev/null
+++ b/docs/source/debughints.rst
@@ -0,0 +1,14 @@
+Debug Hints
+===========
+
+AITemplate is a new project under active development. We have a rich test suite to help avoid bugs, but don't be surprised if anything unexpected happens.
+
+Here are some helpful tips we learned during the development of AITemplate:
+
+1. Once the codegen for an op which requires profiling is changed, remember to delete the old profilers (usually located in the workdir) and flush the cache, either by deleting ~/.aitemplate or by setting the environment variable FLUSH_PROFILE_CACHE=1.
+
+2. If an optimization seems harmful, check the pseudo code/visualization generated by each optimization pass.
+
+3. Always do numerical tests, from small to large, to make sure the entire model is correct.
+
+4. First make the new fusion subgraph work manually, then add an automatic pass to rewrite the graph with the fused subgraph.
\ No newline at end of file
diff --git a/docs/source/genindex.rst b/docs/source/genindex.rst
new file mode 100644
index 000000000..66a235227
--- /dev/null
+++ b/docs/source/genindex.rst
@@ -0,0 +1,2 @@
+Index
+=====
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 000000000..c8e070eac
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,44 @@
+
+AITemplate Documentation
+======================================
+
+AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include:
+
+* High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc.
+* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, with Lego-style, easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms.
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   install/index
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: User Guide
+
+   tutorial/index
+   debughints
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Runtime Design
+
+   runtime/index
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Architecture Guide
+
+   arch/index
+
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Reference Guide
+
+   reference/index
+   reference/env
+   genindex
\ No newline at end of file
diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst
new file mode 100644
index 000000000..862212889
--- /dev/null
+++ b/docs/source/install/index.rst
@@ -0,0 +1,64 @@
+Installing AITemplate
+=====================
+
+Using Docker
+------------
+
+The easiest way to get started is to use Docker. Using Docker avoids performance regressions caused by an incorrect version of NVCC or HIPCC.
+We provide a bash script to build the Docker image.
+
+- CUDA:
+    .. code-block:: bash
+
+        ./docker/build.sh cuda
+- ROCM:
+    .. code-block:: bash
+
+        DOCKER_BUILDKIT=1 ./docker/build.sh rocm
+
+
+This will build a Docker image with the tag `ait:latest`.
+
+To launch the Docker container:
+
+- CUDA:
+    .. code-block:: bash
+
+        docker run --gpus all -it ait:latest
+
+- ROCM:
+    .. code-block:: bash
+
+        docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ait:latest
+
+AITemplate will be installed as a Python package for Python 3.8. There will also be a copy of the source code and examples at `/AITemplate`.
+
+
+Install as a standard Python package
+------------------------------------
+
+Before installing AITemplate, first make sure you have the correct hardware and software environment.
+
+- Hardware
+    - NVIDIA: AIT is only tested on SM80+ GPUs (Ampere etc).
+    - AMD: AIT is only tested on CDNA2 (MI-210/250) GPUs.
+
+.. warning::
+    - Not all kernels work with old SM75/SM70 (T4/V100) GPUs.
+    - There may be compiler issues for old CDNA1 (MI-100) GPUs.
+
+- Software
+    - NVIDIA: CUDA 11.6
+    - AMD: ROCm 5.2, with HIPCC 10736 (commit `b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb`)
+
+.. warning::
+    - An incorrect compiler version will lead to performance regression.
+    - Instructions for building HIPCC 10736 can be found in `docker/Dockerfile.rocm`.
+
+Then build the Python wheel package and install it:
+
+    .. code-block:: bash
+
+        cd python
+        python setup.py bdist_wheel
+        pip install dist/aitemplate-0.0.1-py3-none-any.whl
diff --git a/docs/source/reference/backend.rst b/docs/source/reference/backend.rst
new file mode 100644
index 000000000..fefac5d3f
--- /dev/null
+++ b/docs/source/reference/backend.rst
@@ -0,0 +1,60 @@
+aitemplate.backend
+===========================
+
+aitemplate.backend.task_runner
+------------------------------
+.. automodule:: aitemplate.backend.task_runner
+    :members:
+    :imported-members:
+    :exclude-members: OrderedDict
+    :autosummary:
+
+
+aitemplate.backend.builder
+--------------------------
+.. automodule:: aitemplate.backend.builder
+    :members:
+    :imported-members:
+    :exclude-members: BaseRunner, Target
+    :autosummary:
+
+
+aitemplate.backend.codegen
+---------------------------
+.. automodule:: aitemplate.backend.codegen
+    :members:
+    :imported-members:
+    :exclude-members: Tensor, Target
+    :autosummary:
+
+aitemplate.backend.profiler_cache
+----------------------------------
+.. automodule:: aitemplate.backend.profiler_cache
+    :members:
+    :imported-members:
+    :exclude-members:
+    :autosummary:
+
+aitemplate.backend.profiler_runner
+-----------------------------------
+.. automodule:: aitemplate.backend.profiler_runner
+    :members:
+    :imported-members:
+    :exclude-members: Target, Task, namedtuple, BaseRunner
+    :autosummary:
+
+aitemplate.backend.registry
+----------------------------
+..
automodule:: aitemplate.backend.registry + :members: + :imported-members: + :exclude-members: + :autosummary: + +aitemplate.backend.target +-------------------------- +.. automodule:: aitemplate.backend.target + :members: + :imported-members: + :exclude-members: + :autosummary: diff --git a/docs/source/reference/compiler.rst b/docs/source/reference/compiler.rst new file mode 100644 index 000000000..7b41c26b9 --- /dev/null +++ b/docs/source/reference/compiler.rst @@ -0,0 +1,37 @@ +aitemplate.compiler +============================== + + +base +------------------------ +.. automodule:: aitemplate.compiler.base + :members: + :imported-members: + :exclude-members: ABC, Enum, abstructmethod, dataclass, pformat, reduce + :autosummary: + + +tensor_accessor +----------------------------------- +.. automodule:: aitemplate.compiler.tensor_accessor + :members: + :imported-members: + :exclude-members: IntImm, IntVar, Tensor, pformat + :autosummary: + +compiler +---------------------------- + +.. automodule:: aitemplate.compiler.compiler + :members: + :imported-members: + :exclude-members: IntImm, IntVar, Tensor, pformat, DynamicProfileStrategy + :autosummary: + +model +---------------------------- +.. automodule:: aitemplate.compiler.model + :members: + :imported-members: + :exclude-members: NamedTuple, TypeVar + :autosummary: \ No newline at end of file diff --git a/docs/source/reference/cuda.rst b/docs/source/reference/cuda.rst new file mode 100644 index 000000000..4770026eb --- /dev/null +++ b/docs/source/reference/cuda.rst @@ -0,0 +1,12 @@ +aitemplate.backend.cuda +=========================== + +target_def +---------- +.. automodule:: aitemplate.backend.cuda.target_def + :members: + :imported-members: + :exclude-members: Path, ProfileCacheDB, TargetType + :autosummary: + + diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst new file mode 100644 index 000000000..1342becf6 --- /dev/null +++ b/docs/source/reference/env.rst @@ -0,0 +1,37 @@ +Environment Variables +===================== +AITemplate uses environment variables to control the behavior of codegen and profiling. All the environment variables used in AITemplate are listed here. + +Codegen +------- + +**NUM_BUILDERS**: The number of CPU jobs running in parallel during codegen. It controls both the profiler codegen and the final .so codegen. It's set to 12 in NIGHTLY jobs. Internally, it's set to 12 for normal tests and 24 for heavy tests. By default, the builder uses all the available CPUs for building. + +**RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile. + +Profiling +--------- + +**CACHE_DIR**: The directory for the profiling cache. If unset, it defaults to `~/.aitemplate`. + +**FLUSH_PROFILE_CACHE**: If set to "1", it removes the cache file and recreates an empty one. + +**DISABLE_PROFILER_CODEGEN**: Normally in CI we randomly choose two profilers to codegen. If set to "1", this flag disables profiler codegen completely to speed up long running tests so that the tests don't time out. The default value is "0". + +**CUDA_VISIBLE_DEVICES**: This one is from CUDA itself. It's used to set the number of GPU devices available for profiling. Set to "0,1,2,3,4,5,6,7" to speed up profiling. For benchmarking, it's useful to set to a particular device to lower noise. + +**HIP_VISIBLE_DEVICES**: This one is from ROCm itself. It's used to set the number of GPU devices available for profiling. 
Set to "0,1,2,3,4,5,6,7" to speed up profiling. For benchmarking, it's useful to set to a particular device to lower noise. + +**FORCE_PROFILE**: If set to "1", it will do profiling regarless in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile. + +OSS CI +------ + +**CI_FLAG**: It is set to "CIRCLECI" in OSS CI to indicate we're in OSS CI environment. The behavior of the profiler and codegen is different in CI to speed up testing. Profiling itself for gemm/conv ops is disabled in CI. But we still compiles two random profilers to make sure the profiler codegen is not broken. + +**BUILD_DOCS**: If set to "1", it will create a fake CUDA target to enable doc building in Github Actions. + +Miscellaneous +------------- + +**LOGLEVEL**: It is used to control the logging level in python. It's default to "INFO". "DEBUG" is useful for debugging. diff --git a/docs/source/reference/frontend.rst b/docs/source/reference/frontend.rst new file mode 100644 index 000000000..41bd777dc --- /dev/null +++ b/docs/source/reference/frontend.rst @@ -0,0 +1,14 @@ +aitemplate.frontend +==================== + +.. automodule:: aitemplate.frontend.nn + :members: + :imported-members: + :exclude-members: + :autosummary: + +.. automodule:: aitemplate.frontend.tensor + :members: + :imported-members: + :exclude-members: + :autosummary: diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 000000000..8eb88c925 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,16 @@ +Python API +========== + + +.. toctree:: + :maxdepth: 2 + + compiler + ops + transform + backend + cuda + rocm + frontend + testing + utils diff --git a/docs/source/reference/ops.rst b/docs/source/reference/ops.rst new file mode 100644 index 000000000..f75510665 --- /dev/null +++ b/docs/source/reference/ops.rst @@ -0,0 +1,8 @@ +aitemplate.compiler.ops +======================== + +.. automodule:: aitemplate.compiler.ops + :members: + :imported-members: + :exclude-members: Tensor, TensorAccessor, Enum, Operator, IntImm, IntVar, IntVarTensor, wrap_dim + :autosummary: diff --git a/docs/source/reference/rocm.rst b/docs/source/reference/rocm.rst new file mode 100644 index 000000000..2dc4f6c1e --- /dev/null +++ b/docs/source/reference/rocm.rst @@ -0,0 +1,11 @@ +aitemplate.backend.rocm +=========================== + +target_def +---------- +.. automodule:: aitemplate.backend.rocm.target_def + :members: + :imported-members: + :exclude-members: + :autosummary: + diff --git a/docs/source/reference/testing.rst b/docs/source/reference/testing.rst new file mode 100644 index 000000000..042df07d2 --- /dev/null +++ b/docs/source/reference/testing.rst @@ -0,0 +1,27 @@ +aitemplate.testing +================== + +detect_target +------------- +.. automodule:: aitemplate.testing.detect_target + :members: + :imported-members: + :exclude-members: CUDA, ROCM, Popen + :autosummary: + + +benchmark_pt +------------ +.. automodule:: aitemplate.testing.benchmark_pt + :members: + :imported-members: + :exclude-members: CUDA, ROCM, Popen + :autosummary: + +benchmark_ait +------------- +.. 
automodule:: aitemplate.testing.benchmark_ait + :members: + :imported-members: + :exclude-members: CUDA, ROCM, Popen + :autosummary: \ No newline at end of file diff --git a/docs/source/reference/transform.rst b/docs/source/reference/transform.rst new file mode 100644 index 000000000..4614e15ca --- /dev/null +++ b/docs/source/reference/transform.rst @@ -0,0 +1,209 @@ +aitemplate.compiler.transform +============================== + + +apply_padding +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.apply_padding + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + + +bind_constants +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.bind_constants + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +constant_folding +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.constant_folding + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor + :autosummary: + +fuse_conv_elementwise +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_conv_elementwise + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor + :autosummary: + +fuse_group_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_group_ops + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, all_static_dimensions + :autosummary: + + +fuse_mm_elementwise +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_mm_elementwise + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, FuncEnum, elementwise, gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish, copy_tensor_attributes, extract_only_one_op, get_patterns, remove_single_tensor_op_from_sorted_graph, sanitize_sorted_graph + :autosummary: + +fuse_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_ops + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, FuncEnum, elementwise, gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish, copy_tensor_attributes, extract_only_one_op, get_patterns, remove_single_tensor_op_from_sorted_graph, sanitize_sorted_graph, layernorm_sigmoid_mul + :autosummary: + +fuse_parallel_gemms +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_parallel_gemms + :members: + :imported-members: + :exclude-members: TensorAccessor, is_static_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, FuncEnum, elementwise, gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish, copy_tensor_attributes, extract_only_one_op, get_patterns, remove_single_tensor_op_from_sorted_graph, sanitize_sorted_graph, layernorm_sigmoid_mul + :autosummary: + +fuse_permute_bmm +------------------------------------------- +.. 
automodule:: aitemplate.compiler.transform.fuse_permute_bmm + :members: + :imported-members: + :exclude-members: copy_src_op_attributes, remove_tensor_from_sorted_graph, bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr, gemm_rrr, gemm_rrr_bias, permute021, TensorAccessor, is_static_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, FuncEnum, elementwise, gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish, copy_tensor_attributes, extract_only_one_op, get_patterns, remove_single_tensor_op_from_sorted_graph, sanitize_sorted_graph, layernorm_sigmoid_mul + :autosummary: + +fuse_split +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.fuse_split + :members: + :imported-members: _fuse_split_and_strided_op + :exclude-members: IntVar, copy_src_op_attributes, remove_tensor_from_sorted_graph, bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr, gemm_rrr, gemm_rrr_bias, permute021, TensorAccessor, is_static_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm, AITData, replace_tensor, FuncEnum, elementwise, gemm_rcr, gemm_rcr_bias, gemm_rcr_bias_swish, copy_tensor_attributes, extract_only_one_op, get_patterns, remove_single_tensor_op_from_sorted_graph, sanitize_sorted_graph, layernorm_sigmoid_mul + :autosummary: + +mark_param_tensor +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.mark_param_tensor + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +memory_planning +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.memory_planning + :members: + :imported-members: + :exclude-members: TensorUsageRecord, Workspace, assign_offsets_to_views_and_outputs, greedy_by_size_memory_planning, DimInfo, IntImm, Operator, Source, Tensor, gemm, defaultdict, dataclass + :autosummary: + +name_graph +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.name_graph + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + + +optimize_graph +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.optimize_graph + :members: + :imported-members: + :exclude-members: transform_strided_ops, transform_special_ops, transform_odd_alignment, transform_memory_ops, fuse_permute_bmm, fuse_parallel_gemms, fuse_mm_elementwise, apply_padding, fuse_conv_elementwise, fuse_group_ops, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + + +profile +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.profile + :members: + :imported-members: + :exclude-members: DynamicProfileStrategy, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +refine_graph +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.refine_graph + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +remove_no_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.remove_no_ops + :members: + :imported-members: + :exclude-members: IntVar, is_singleton_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +remove_unused_ops +------------------------------------------- +.. 
automodule:: aitemplate.compiler.transform.remove_unused_ops + :members: + :imported-members: + :exclude-members: deque, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +toposort +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.toposort + :members: + :imported-members: + :exclude-members: DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + + +transform_memory_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_memory_ops + :members: + :imported-members: + :exclude-members: TensorAccessor, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +transform_odd_alignment +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_odd_alignment + :members: + :imported-members: + :exclude-members: can_be_constant_folded, copy_src_op_attributes, copy_tensor_attributes, extract_only_one_op, remove_tensor_from_sorted_graph, replace_tensor, sanitize_sorted_graph, toposort, IntVar, bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr, permute021, unsqueeze, TensorAccessor, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + + +transform_special_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_special_ops + :members: + :imported-members: + :exclude-members: gemm_rrr, is_singleton_dimension, gemm_rcr, gemm_rrr_small_nk, can_be_constant_folded, copy_src_op_attributes, copy_tensor_attributes, extract_only_one_op, remove_tensor_from_sorted_graph, replace_tensor, sanitize_sorted_graph, toposort, IntVar, bmm_ccr, bmm_crr, bmm_rcr, bmm_rrr, permute021, unsqueeze, TensorAccessor, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +transform_strided_op_and_view_op +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_strided_op_and_view_op + :members: + :imported-members: + :exclude-members: IntVar, is_singleton_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +transform_strided_ops +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_strided_ops + :members: + :imported-members: + :exclude-members: get_tensor_index, slice_reshape_scatter, slice_scatter, gen_tensor_index, IntVar, is_singleton_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: + +transform_strided_slice +------------------------------------------- +.. automodule:: aitemplate.compiler.transform.transform_strided_slice + :members: + :imported-members: + :exclude-members: dynamic_slice, get_tensor_index, slice_reshape_scatter, slice_scatter, gen_tensor_index, IntVar, is_singleton_dimension, DimInfo, IntImm, Operator, Source, Tensor, gemm + :autosummary: diff --git a/docs/source/reference/utils.rst b/docs/source/reference/utils.rst new file mode 100644 index 000000000..6c35fc39d --- /dev/null +++ b/docs/source/reference/utils.rst @@ -0,0 +1,12 @@ +aitemplate.utils +================== + + +visualization.plot +------------------ +.. automodule:: aitemplate.utils.visualization.plot + :members: + :imported-members: + :exclude-members: Tensor, Operator + :autosummary: + diff --git a/docs/source/runtime/cxx_design.rst b/docs/source/runtime/cxx_design.rst new file mode 100644 index 000000000..5ef18f889 --- /dev/null +++ b/docs/source/runtime/cxx_design.rst @@ -0,0 +1,29 @@ +================== +C++ Runtime Note +================== + +`Model` v.s. 
`ModelContainer` +============================== + +These are the two main classes involved in the C++ runtime implementation. + +* The bulk of the runtime implementation is in `Model`. +* `ModelContainer` stores a set of shared constants and a collection of `Model`s. Almost all functions in `model_interface.h` forward to a method on `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one is available (see the section on asynchronous predictions). It then forwards the run request to the runtime. + +Code Structure +============== + +Some important files: + +1. `include/model_interface.h`: The interface that we expose in the compiled .so +2. `include/model_container.h`: The bulk of the `ModelContainer` implementation. + +Some files are generated at compile time. These include: + +* `model-generated.h`: The implementation for `Model`. +* `model_container_base.cu`: A small part of the implementation for `ModelContainer` needs to be codegened. So `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details. + +All codegen templates can be found in `backend/main_templates.py`. The codegen implementation is in `backend/codegen.py`. + +Note that many of the headers in this directory rely on generated code and thus cannot be `#include`d in external projects. The exception is `model_interface.h`. + diff --git a/docs/source/runtime/index.rst b/docs/source/runtime/index.rst new file mode 100644 index 000000000..0dd2462ff --- /dev/null +++ b/docs/source/runtime/index.rst @@ -0,0 +1,9 @@ +Runtime Note +================== + + +.. toctree:: + :maxdepth: 1 + + cxx_design + py_design diff --git a/docs/source/runtime/py_design.rst b/docs/source/runtime/py_design.rst new file mode 100644 index 000000000..c143123de --- /dev/null +++ b/docs/source/runtime/py_design.rst @@ -0,0 +1,135 @@ +===================== +Python Runtime Note +===================== + +Python `Model` +============== + +`Model` is a collection of Python bindings to the C++ AIT runtime. This section describes the API. + +`AITData` +--------- + +This class represents a contiguous blob of memory that AIT will use as a tensor. It is simply a named tuple with these fields: + +* `data_ptr: int`: An **unowned** pointer to **GPU** memory. In general, all of the APIs expect that this pointer will be valid for the entire duration of the call. +* `shape: List[int]`: The shape of the tensor. +* `dtype: str`: The tensor's dtype; one of `"float32", "float16", "int32", "int64"`. Note that most ops only support float16 at this stage. + +If using AITemplate with PyTorch, `AITData`s can be constructed with the `torch_to_ait_data` utility: + +.. code-block:: python + + x = torch.randn(3, 3, 3).half().cuda() + # Equivalent to AITData(x.data_ptr(), [3, 3, 3], "float16") + x_ait = torch_to_ait_data(x) + + +If PyTorch is not available, `Model` provides a set of functions for copying, allocating, and freeing GPU memory. See the docstrings in `compiler/model.py` for more information. + +`run` +----- + +`run` takes a set of inputs and outputs as `AITData`s. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor). + +.. code-block:: python + + # Arguments as a dictionary + module.run( + {"input0": in0_ait, "input1": in1_ait}, + {"output0": out0_ait, "output1": out0_ait}, + ) + + # Arguments as an ordered list. Note that you might need to query + # the index mapping. 
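+    # get_input_name_to_index_map() / get_output_name_to_index_map() return
+    # dicts mapping tensor names to their positions in the ordered lists.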
+    input_name_to_idx = module.get_input_name_to_index_map()
+    output_name_to_idx = module.get_output_name_to_index_map()
+
+    inputs = [None for i in range(len(input_name_to_idx))]
+    outputs = [None for i in range(len(output_name_to_idx))]
+
+    for name in input_name_to_idx:
+        inputs[input_name_to_idx[name]] = ait_inputs[name]
+
+    for name in output_name_to_idx:
+        outputs[output_name_to_idx[name]] = ait_outputs[name]
+
+    module.run(inputs, outputs)
+
+
+One important caveat is that the output buffer must be allocated at its **maximum** size. This is because of dynamic shapes: the size of the output may vary, and its actual shape is not inferred until inference time. The maximum shape can be queried with `get_output_maximum_shape()`:
+
+.. code-block:: python
+
+    # Can use either name or index.
+    name_to_idx = module.get_output_name_to_index_map()
+    max_shape = module.get_output_maximum_shape(name_to_idx["output"])
+    max_shape = module.get_output_maximum_shape("output")
+
+
+`Model.run` returns a dictionary of output `AITData`s with the (possibly dynamic) shapes that the runtime inferred.
+
+Nullptr Inputs/Outputs
+----------------------
+
+In general, inputs are allowed to be null if they are size 0 (i.e. at least one dimension is 0). The runtime enforces this with a check before any kernels are launched:
+
+.. code-block:: cpp
+
+    if (input_name == nullptr && dim0 * dim1 * … * dimN != 0) {
+      throw std::runtime_error("input_name cannot be null!");
+    }
+
+
+This is convenient since `torch.data_ptr()` returns null for size-zero tensors. The dynamic shape computation is skipped if the lower bound of the tensor's size is positive.
+
+Constants
+---------
+
+There are two types of constants in AIT: *bound* and *unbound* constants. A bound constant is known at compile time and may participate in constant folding. Bound constants are copied into GPU memory at model loading time. Values for bound constants may be provided by passing a dictionary (mapping constant name to AIT tensor) to `compile_model`.
+
+Unbound constants, on the other hand, do not participate in constant folding and must be provided before running the model. These must be set via `Model.set_constant`:
+
+.. code-block:: python
+
+    module.set_constant("my_constant", AITData(...))
+    # The pointer in the tensor must live for the entire duration of run()
+    module.run(...)
+
+
+Constants are read-only and *shared* across all runtimes in the `ModelContainer`.
+
+`run_with_tensors`
+------------------
+
+`run_with_tensors` is a convenience method with the same interface as `run`, except that it can take lists of `torch.Tensor`s:
+
+.. code-block:: python
+
+    input0 = torch.randn(input0_shape).cuda().half()
+    output0 = torch.empty(output0_shape).cuda().half()
+    # Returns a dictionary of reshaped outputs
+    result = module.run_with_tensors([input0], [output0])
+
+
+Streams and Asynchronous Predictions
+------------------------------------
+
+A pointer to a stream can optionally be passed to `run`. If none is given, the prediction happens on the default stream 0. If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. `sync` is `True` by default.
+
+Multiple predictions can happen at the same time (on the same or different streams). Under the hood, there is a fixed-size pool of runtime objects. When all the runtimes are in use, `run()` blocks until one becomes available.
+The size of this pool can be configured with the `num_runtimes` option in `Model`'s constructor.
+
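+A minimal sketch of issuing concurrent predictions is shown below. It is illustrative only: the module path and tensor shapes are hypothetical placeholders, and only `num_runtimes` and `run_with_tensors` come from the API described in this note.
+
+.. code-block:: python
+
+    from concurrent.futures import ThreadPoolExecutor
+
+    import torch
+    from aitemplate.compiler import Model
+
+    # Hypothetical compiled module and shapes -- substitute your own.
+    module = Model("./tmp/my_model/test.so", num_runtimes=2)
+    inputs = [torch.randn(8, 64).cuda().half() for _ in range(4)]
+    outputs = [torch.empty(8, 64).cuda().half() for _ in range(4)]
+
+    def predict(inp, out):
+        # Each call borrows a runtime from the fixed-size pool,
+        # or blocks until one becomes available.
+        return module.run_with_tensors([inp], [out])
+
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        futures = [pool.submit(predict, i, o) for i, o in zip(inputs, outputs)]
+        results = [f.result() for f in futures]
+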
+CUDA Graph
+----------
+
+`run` also takes a `graph_mode` option. If set to true, the runtime will try to use `CUDA graphs <https://developer.nvidia.com/blog/cuda-graphs/>`_ to run the model. `graph_mode` is not supported on ROCm.
+
+The following is a high-level overview of how graph mode works:
+
+1) Each `Model` has an internal stream used for graph capturing. The model first runs all ops on this stream in capture mode. No kernel launches happen during this stage.
+2) If this is the first run, a graph is instantiated via `cudaGraphInstantiate`.
+3) On subsequent runs, we try to avoid the relatively expensive `cudaGraphInstantiate` call by updating the graph executor (`cudaGraphExecUpdate`). However, a new graph may still be instantiated if the topology of the graph changed between runs.
+4) Once we have the graph executor, we launch a single kernel on the stream that the user provided to `run()`.
+
+Graph mode is mainly beneficial when there are many small kernel launches, since all of that launch overhead is replaced by a single kernel launch.
diff --git a/docs/source/tutorial/how_to_add_op.rst b/docs/source/tutorial/how_to_add_op.rst
new file mode 100644
index 000000000..160745336
--- /dev/null
+++ b/docs/source/tutorial/how_to_add_op.rst
@@ -0,0 +1,302 @@
+How to add an operator to the AIT codegen
+=========================================
+
+This tutorial will demonstrate how to add a new operator to the AIT codegen.
+The full source code can be found at `examples/06_how_to_add_an_op/how_to_add_an_op.py`.
+
+
+0. Prerequisites
+-----------------
+
+We need to import the necessary Python modules:
+
+.. code-block:: python
+
+    from typing import Any, Dict, List
+
+    import jinja2
+    import torch
+
+    from aitemplate import backend
+    from aitemplate.backend import registry
+    from aitemplate.backend.backend_spec import CUDASpec, ROCMSpec
+    from aitemplate.compiler import compile_model
+    from aitemplate.compiler.base import IntVar, Operator, Tensor
+    from aitemplate.testing import detect_target
+
+
+1. Define the operator graph node
+----------------------------------
+
+Graph nodes are usually defined in `aitemplate/compiler/ops`.
+
+.. code-block:: python
+
+    class add_one(Operator):
+        def __init__(self):
+            super().__init__()
+            # required: a unique identity for the operator category
+            self._attrs["op"] = "add_one"
+            # we can put whatever we want into the op attrs for later use
+            self._attrs["has_profiler"] = False
+            self._attrs["nop"] = False
+
+        def __call__(self, x: Tensor) -> Tensor:
+            # each operator needs to keep a record of its input tensors
+            self._attrs["inputs"] = [x]
+            # optional: set the depth of the op based on the inputs' depth, used in DFS
+            self._set_depth()
+            # infer the output shape
+            output_shape = self._infer_shape(x)
+            # create the output Tensor, whose source op is the current op
+            output = Tensor(output_shape, src_ops={self})
+            # remember the current op's outputs
+            self._attrs["outputs"] = [output]
+            return output
+
+        def _infer_shape(self, x) -> List[IntVar]:
+            # Infer the output shape.
+            # If we need to infer the shape on the C++ side, we create a jinja2 template
+            # for the shape inference function, render it into Python code in the graph
+            # node, and render it into C++ code in the codegen.
+            return x.shape()
+
+        def gen_function(self) -> str:
+            # this function will be used in codegen;
+            # here we only need to redirect to the backend codegen function
+            target = backend.target.Target.current()
+            func_key = f"{target.name()}.{self._attrs['op']}.gen_function"
+            func = registry.get(func_key)
+            return func(self._attrs)
+
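+Before wiring up any codegen, it can be handy to sanity-check the graph node on its own. The short sketch below is illustrative only; it reuses the `Tensor` constructor arguments from step 6 of this tutorial with a hypothetical `[16, 512]` shape.
+
+.. code-block:: python
+
+    # Build a tiny graph with the new op; no compilation happens here.
+    X = Tensor(shape=[16, 512], dtype="float16", name="X", is_input=True)
+    Y = add_one()(X)
+    # The output shape mirrors the input shape, per _infer_shape above.
+    print(Y.shape())
+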
+.. note::
+
+   - `_attrs` in `Operator` is the most important data structure for codegen.
+   - `_attrs["op"]` is the identity of the operator category, which is used to find the corresponding codegen function in the backend; it must be **unique**.
+
+2. Define the necessary templates for Codegen
+----------------------------------------------
+
+In AIT, there are 4 important templates for codegen:
+
+- `FUNC_TEMPLATE`: the template for generating the function body of the operator, which invokes the GPU kernel.
+- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defines the name and arguments of the function.
+- `FUNC_CALL_TEMPLATE`: the template for generating the function call of the operator. The call is used during inference to invoke the GPU kernel with the given arguments.
+- `FUNC_DECL`: the template for the forward declaration of the operator function. This is usually an alias of `FUNC_SIGNATURE_TEMPLATE`.
+
+.. code-block:: python
+
+    FUNC_TEMPLATE = jinja2.Template(
+        """
+    {{header_files}}
+
+    namespace {
+
+    {{kernel}}
+
+    } // namespace
+
+    {{func_signature}}
+    {
+        invoke_add_one(output, input, num_elements, stream);
+    }
+        """
+    )
+
+    FUNC_SIGNATURE = jinja2.Template(
+        """
+    void {{func_name}}(half* output,
+                       const half* input,
+                       const int64_t num_elements,
+                       {{prefix}}Stream_t stream)
+        """
+    )
+
+    FUNC_DECL = jinja2.Template(
+        """
+    {{func_signature}};
+        """
+    )
+
+    FUNC_CALL_TEMPLATE = jinja2.Template(
+        """
+    {{indent}}int64_t num_elements = 1;
+    {% for dim_name in dim_names %}
+    {{indent}}num_elements *= {{dim_name}};
+    {% endfor %}
+    {{indent}}{{func_name}}(
+    {{indent}}    {{output}}, {{input}}, num_elements, stream /* default stream */
+    {{indent}});
+        """
+    )
+
+3. Create the GPU kernels
+--------------------------
+
+In this example we use the simplest possible "add one" kernel. The kernel can be written by hand (as a programmer is normally expected to do) or generated by other tools.
+
+.. code-block:: python
+
+    KERNEL_TEMPLATE = jinja2.Template(
+        """
+    __global__ void add_one(half* output, const half* input, const int64_t num_elements) {
+      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+      if (idx < num_elements) {
+        output[idx] = input[idx] + half(1.0);
+      }
+    }
+
+    void invoke_add_one(half* output, const half* input, int64_t num_elements, {{prefix}}Stream_t stream) {
+      if (num_elements < 1024) {
+        dim3 grid(1);
+        dim3 block(num_elements);
+        add_one<<<grid, block, 0, stream>>>(output, input, num_elements);
+      } else {
+        dim3 grid((num_elements + 1024 - 1) / 1024);
+        dim3 block(1024);
+        add_one<<<grid, block, 0, stream>>>(output, input, num_elements);
+      }
+    }
+        """
+    )
+
+(Optional) We also provide a helper function to handle the CUDA/ROCm float16 data type difference.
+
+.. code-block:: python
+
+    FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
+        """reinterpret_cast<half*>(
+    {% if is_cuda %}&({% endif %}{{name}}{% if is_cuda %}->raw()){% endif %})"""
+    )
+
+4. Define the codegen function
+-------------------------------
+
+The codegen function renders the templates we defined above into a valid C++ code string.
+It takes `func_attrs` from the graph node and fills them into the jinja2 templates.
+
+..
code-block:: python + + def gen_function_call(func_attrs: Dict[str, Any], indent=" ", is_cuda=False) -> str: + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 1 + + output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"], is_cuda=is_cuda + ) + input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][0]._attrs["name"], is_cuda=is_cuda + ) + + dim_names = [dim._attrs["name"] for dim in func_attrs["inputs"][0].shape()] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + input=input_name, + dim_names=dim_names, + indent=indent, + ) + + + def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + prefix = backend_spec.prefix + return FUNC_TEMPLATE.render( + header_files=header_files, + kernel=KERNEL_TEMPLATE.render(prefix=prefix), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], prefix=prefix + ), + ) + + + def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + prefix=backend_spec.prefix, + ).strip() + ) + +5.1 Register the codegen function to CUDA backend +--------------------------------------------------- + +CUDA backend functions is usually defined at `aitemplate/backend/cuda/`. + +.. code-block:: python + + CUDA_HEADER_FILES = """ + #include + """ + + + @registry.reg("cuda.add_one.gen_function") + def cuda_add_one_gen_function(func_attrs: Dict[str, Any]) -> str: + return gen_function(func_attrs, CUDA_HEADER_FILES, CUDASpec()) + + + @registry.reg("cuda.add_one.func_decl") + def cuda_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str: + return gen_function_decl(func_attrs, CUDASpec()) + + + @registry.reg("cuda.add_one.func_call") + def cuda_add_one_gen_function_call(func_attrs: Dict[str, Any], indent=" ") -> str: + return gen_function_call(func_attrs, indent, is_cuda=True) + +5.2 (Optional) Register the codegen function to ROCm backend +-------------------------------------------------------------- + +ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. + + +.. code-block:: python + + HIP_HEADER_FILES = """ + #include + #include + """ + + + @registry.reg("rocm.add_one.gen_function") + def rocm_add_one_gen_function(func_attrs: Dict[str, Any]) -> str: + return gen_function(func_attrs, HIP_HEADER_FILES, ROCMSpec()) + + + @registry.reg("rocm.add_one.func_decl") + def rocm_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str: + return gen_function_decl(func_attrs, ROCMSpec()) + + + @registry.reg("rocm.add_one.func_call") + def rocm_add_one_gen_function_call(func_attrs: Dict[str, Any], indent=" ") -> str: + return gen_function_call(func_attrs, indent, is_cuda=False) + + +6. Compile and verify the results with PyTorch +------------------------------------------------ + +.. 
code-block:: python + + def create_ait_model(shapes): + X = Tensor( + shape=shapes, + dtype="float16", + name="X", + is_input=True, + ) + Y = add_one()(X) + Y._attrs["is_output"] = True + Y._attrs["name"] = "Y" + return Y + + + def verify_add_one(): + shapes = [16, 512] + x = torch.randn(shapes).cuda().half() + y_pt = x + 1.0 + + Y = create_ait_model([16, 512]) + target = detect_target() + with compile_model(Y, target, "./tmp", "add_one") as module: + y = torch.empty(shapes).cuda().half() + inputs = {"X": x} + outputs = {"Y": y} + module.run_with_tensors(inputs, outputs) + print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) + diff --git a/docs/source/tutorial/how_to_infer_pt.rst b/docs/source/tutorial/how_to_infer_pt.rst new file mode 100644 index 000000000..67891c46a --- /dev/null +++ b/docs/source/tutorial/how_to_infer_pt.rst @@ -0,0 +1,188 @@ +How to inference a PyTorch model with AIT +========================================== + +This tutorial will demonstrate how to inference a PyTorch model with AIT. +Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py` + +0. Prerequisites +----------------- + +We need to import necessary Python modules + +.. code-block:: python + + from collections import OrderedDict + + import torch + + from aitemplate.compiler import compile_model + from aitemplate.frontend import nn, Tensor + from aitemplate.testing import detect_target + from aitemplate.testing.benchmark_pt import benchmark_torch_function + from aitemplate.utils.graph_utils import sorted_graph_pseudo_code + + +1. Define a PyTorch module +--------------------------- + +Here we define a PyTorch model which is commonly seen in Transformers. + +.. code-block:: python + + class PTSimpleModel(torch.nn.Module): + def __init__(self, hidden, eps: float = 1e-5): + super().__init__() + self.dense1 = torch.nn.Linear(hidden, 4 * hidden) + self.act1 = torch.nn.functional.gelu + self.dense2 = torch.nn.Linear(4 * hidden, hidden) + self.layernorm = torch.nn.LayerNorm(hidden, eps=eps) + + def forward(self, input): + hidden_states = self.dense1(input) + hidden_states = self.act1(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = hidden_states + input + hidden_states = self.layernorm(hidden_states) + return hidden_states + +2. Define an AIT module +------------------------ + +We can define a similar AIT module as follows: + +.. code-block:: python + + class AITSimpleModel(nn.Module): + def __init__(self, hidden, eps: float = 1e-5): + super().__init__() + self.dense1 = nn.Linear(hidden, 4 * hidden, specialization="fast_gelu") + self.dense2 = nn.Linear(4 * hidden, hidden) + self.layernorm = nn.LayerNorm(hidden, eps=eps) + + def forward(self, input): + hidden_states = self.dense1(input) + hidden_states = self.dense2(hidden_states) + hidden_states = hidden_states + input + hidden_states = self.layernorm(hidden_states) + return hidden_states + +.. warning:: + The `nn.Module` API in AIT looks similar to PyTorch, but it is not the same. + + The fundamental difference is that AIT module is a container to build graph, while PyTorch module is a container to store parameters for eager. + Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. If you want to share parameters, needs to call `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch. + + AITemplate supports automatically fusion on linear followed by other operators. 
+However, in many cases, especially for quick iterations, we use a manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` fuses the linear layer with the `fast_gelu` operator.
+
+3. Define a helper function to map PyTorch parameters to AIT parameters
+-------------------------------------------------------------------------
+
+In AIT, all names must follow the C variable naming standard because the names are used in the codegen process.
+
+.. code-block:: python
+
+    def map_pt_params(ait_model, pt_model):
+        ait_model.name_parameter_tensor()
+        pt_params = dict(pt_model.named_parameters())
+        mapped_pt_params = OrderedDict()
+        for name, _ in ait_model.named_parameters():
+            ait_name = name.replace(".", "_")
+            assert name in pt_params
+            mapped_pt_params[ait_name] = pt_params[name]
+        return mapped_pt_params
+
+.. warning::
+
+   - Unlike PyTorch, it is required to call the ait_model **.name_parameter_tensor()** method, which gives each parameter a name that maps directly to its PyTorch counterpart.
+   - Because all names in AIT must follow the C variable naming standard, you can simply replace `.` with `_`, or use a regular expression, to make sure each name is valid.
+   - For networks with conv + bn subgraphs, we currently do not provide an automatic pass to fold them. Refer to our ResNet and Detectron2 examples to see how we handle CNN layout transforms and BatchNorm folding.
+
+4. Create PyTorch module, inputs/outputs
+-----------------------------------------
+
+.. code-block:: python
+
+    batch_size = 1024
+    hidden = 512
+    # create pt model
+    pt_model = PTSimpleModel(hidden).cuda().half()
+
+    # create pt input
+    x = torch.randn([batch_size, hidden]).cuda().half()
+
+    # run pt model
+    pt_model.eval()
+    y_pt = pt_model(x)
+
+5. Create AIT module, inputs/outputs
+-------------------------------------
+
+.. code-block:: python
+
+    batch_size = 1024
+    hidden = 512
+    # create AIT model
+    ait_model = AITSimpleModel(hidden)
+    # create AIT input Tensor
+    X = Tensor(
+        shape=[batch_size, hidden],
+        name="X",
+        dtype="float16",
+        is_input=True,
+    )
+    # run AIT module to generate output tensor
+    Y = ait_model(X)
+    # mark the output tensor
+    Y._attrs["is_output"] = True
+    Y._attrs["name"] = "Y"
+
+.. warning::
+
+   - Similar to MetaTensor, LazyTensor, and many other lazy-evaluation frameworks, AIT's Tensor records the computation graph, and the graph is built when the Tensor is compiled.
+   - For input tensors, it is required to set the attribute **is_input=True**.
+   - For output tensors, it is required to set the attribute **Y._attrs["is_output"] = True**.
+   - For input and output tensors, it is better to provide **name** attributes so they can be referenced at runtime.
+
+6. Compile the AIT module into a runtime, and do verification
+--------------------------------------------------------------
+
+..
code-block:: python + + # map pt weights to ait + weights = map_pt_params(ait_model, pt_model) + + # codegen + target = detect_target() + with compile_model( + Y, target, "./tmp", "simple_model_demo", constants=weights + ) as module: + # create storage for output tensor + y = torch.empty([batch_size, hidden]).cuda().half() + + # inputs and outputs dict + inputs = {"X": x} + outputs = {"Y": y} + + # run + module.run_with_tensors(inputs, outputs, graph_mode=True) + + # verify output is correct + print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) + + # benchmark ait and pt + count = 1000 + ait_t, _, _ = module.benchmark_with_tensors( + inputs, outputs, graph_mode=True, count=count + ) + print(f"AITemplate time: {ait_t} ms/iter") + + pt_t = benchmark_torch_function(count, pt_model.forward, x) + print(f"PyTorch eager time: {pt_t} ms/iter") + + +In this example, AIT will automatically fuse GELU and elementwise add into TensorCore/MatrixCore gemm operation. On RTX-3080 for this example, AIT is about 1.15X fast than PyTorch Eager in this example. + +.. note:: + + - In this example, we fold parameters (weights) into AIT runtime, which the final dynamic library will contains parameters. + - If during compile we don't provide parameters, for example the total parameters size is greater than 2GB, we can always call `set_constant` function in runtime. Check runtime API for details. \ No newline at end of file diff --git a/docs/source/tutorial/how_to_visualize.rst b/docs/source/tutorial/how_to_visualize.rst new file mode 100644 index 000000000..5af7c89a5 --- /dev/null +++ b/docs/source/tutorial/how_to_visualize.rst @@ -0,0 +1,85 @@ +How to visualize an AIT model +============================== + +Visualization is important for understanding the behavior of a model optimization. +In AIT, we modify the codegen a little bit, from generating CUDA/HIP C++ code to HTML/Javascript code, +then we can generate a visualization of the model. + + +The following code will generate a visualization of our first example. + +1. Define the AIT Model +------------------------ + +.. code-block:: python + + from aitemplate import compiler + from aitemplate.frontend import nn, Tensor + from aitemplate.testing import detect_target + from aitemplate.utils.visualization import plot_graph + + class AITSimpleModel(nn.Module): + def __init__(self, hidden, eps: float = 1e-5): + super().__init__() + self.dense1 = nn.Linear(hidden, 4 * hidden, specialization="fast_gelu") + self.dense2 = nn.Linear(4 * hidden, hidden) + self.layernorm = nn.LayerNorm(hidden, eps=eps) + + def forward(self, input): + hidden_states = self.dense1(input) + hidden_states = self.dense2(hidden_states) + hidden_states = hidden_states + input + hidden_states = self.layernorm(hidden_states) + return hidden_states + + def gen_ait_model(): + batch_size = 512 + hidden = 1024 + ait_model = AITSimpleModel(hidden) + ait_model.name_parameter_tensor() + X = Tensor( + shape=[batch_size, hidden], + name="X", + dtype="float16", + is_input=True, + ) + Y = ait_model(X) + Y._attrs["is_output"] = True + Y._attrs["name"] = "Y" + return Y + + output_tensor = gen_ait_model() + +2. Apply optimizations on the AIT Model +--------------------------------------- + +.. 
code-block:: python + + def apply_optimizations(tensors): + target = detect_target() + # first, convert output tensors to graph + with target: + graph = compiler.transform.toposort(tensors) + # second, provide names to the graph + compiler.transform.name_graph(graph) + compiler.transform.mark_param_tensor(graph) + compiler.transform.mark_special_views(graph) + # we can apply optimizations to the graph, or test single optimization pass on the graph + graph = compiler.transform.optimize_graph(graph, "./tmp") + return graph + + graph = apply_optimizations(output_tensor) + +3. Generate visualization +-------------------------- + +.. code-block:: python + + # Plot the graph + plot_graph(graph, file_path="ait_model.html", network_name="ait_sample_net") + +The visualization will be generated in the "ait_model.html" file. This file can be opened in Chrome without any web server. + +.. raw:: html + + \ No newline at end of file diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst new file mode 100644 index 000000000..339bd16c1 --- /dev/null +++ b/docs/source/tutorial/index.rst @@ -0,0 +1,9 @@ +Tutorials +========= + +.. toctree:: + :maxdepth: 1 + + how_to_infer_pt + how_to_add_op + how_to_visualize diff --git a/docs/static/ait_model.html b/docs/static/ait_model.html new file mode 100644 index 000000000..18c56089d --- /dev/null +++ b/docs/static/ait_model.html @@ -0,0 +1,866 @@ + + + + + + + ait_sample_net + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/01_resnet-50/README.md b/examples/01_resnet-50/README.md new file mode 100644 index 000000000..3f75060ff --- /dev/null +++ b/examples/01_resnet-50/README.md @@ -0,0 +1,84 @@ +# ResNet-50 + +In this example, we will demo how to use AITemplate for inference on the ResNet-50 model from PyTorch Image Models (TIMM). + +We will demo two usages: +* Using AIT to accelerate PyTorch inference +* Using AIT standalone without PyTorch + +## Code structure +``` +modeling + resnet.py # ResNet definition using AIT's frontend API +weight_utils.py # Utils to convert TIMM R-50 weights to AIT +infer_with_torch.py # Example to accelerate PyTorch, and seamlessly use with other PyTorch code +infer_with_numpy.py # Dump TIMM weights to Numpy and use AIT & Numpy without 3rdparties +benchmark_pt.py # Benchmark code for PyTorch +benchmark_ait.py # Benchmark code for AIT +``` + +## Multi-GPU profiling +AIT requires to do profiling to decide best algorithms for CUTLASS and CK. +To enable multiple GPUs profiling, use the environment variable `CUDA_VISIBLE_DEVICES` on NVIDIA platform and `HIP_VISIBLE_DEVICES` on AMD platform. + +For example, `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 benchmark_ait.py`. + +Benchmark is fast once the profilers are built. + +## Reference Speed vs PyTorch Eager + +### A100-40GB / CUDA 11.6.2 +_PT = PyTorch 1.12 Eager_ + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 7.68 | 130.29 | 0.58 | 1730.17 | +| 2 | 7.16 | 279.36 | 0.62 | 3250.74 | +| 4 | 7.17 | 557.68 | 0.69 | 5773.20 | +| 8 | 7.02 | 1138.83 | 0.88 | 9104.44 | +| 16 | 7.01 | 2280.97 | 1.33 | 12012.81 | +| 32 | 7.53 | 4251.30 | 2.40 | 13350.58 | +| 64 | 13.98 | 4578.09 | 4.53 | 14140.83 | +| 128 | 26.57 | 4816.71 | 8.57 | 14935.82 | +| 256 | 50.93 | 5026.40 | 16.58 | 15444.57 | + + +### MI-250 / ROCm 5.2.3 / HIPCC-10736 +_PT = PyTorch 1.12 Eager_ +#### 1 GCD + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 3.94 | 254.06 | 2.28 | 438.60 | +| 2 | 3.89 | 514.48 | 2.25 | 888.89 | +| 4 | 3.82 | 1047.11 | 2.38 | 1680.67 | +| 8 | 4.40 | 1819.27 | 2.62 | 3053.44 | +| 16 | 6.48 | 2468.65 | 3.41 | 4692.08 | +| 32 | 10.40 | 3076.97 | 4.86 | 6584.36 | +| 64 | 18.35 | 3488.12 | 8.26 | 7748.18 | +| 128 | 34.36 | 3724.76 | 15.38 | 8322.50 | +| 256 | 65.35 | 3917.29 | 29.62 | 8642.81 | + +#### 2 GCDs + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | | | | | +| 2 | 3.94 | 507.54 | 2.36 | 848.15 | +| 4 | 3.89 | 1028.60 | 2.34 | 1710.94 | +| 8 | 3.88 | 2059.41 | 2.70 | 2960.46 | +| 16 | 4.56 | 3507.48 | 2.83 | 5663.52 | +| 32 | 6.72 | 4762.89 | 3.87 | 8275.98 | +| 64 | 10.82 | 5917.63 | 5.26 | 12173.67 | +| 128 | 18.79 | 6812.09 | 8.98 | 14247.09 | +| 256 | 35.99 | 7112.59 | 16.69 | 15338.58 | + + + +### Note for Performance Results + +- For NVIDIA A100, our test cluster doesn't allow to lock frequency. We make warm up longer to collect more stable results, but it is expected to have small variance to the results with locked frequency. 
+- To benchmark MI-250, the first step is to run `python3 benchmark_ait.py` to generate all necessary model dynamic library files with single GCD. Then run `./benchmark_mi250.sh {batch_size}` to simulate data parallel execution on 2 GCDs, each GCD is processing half of the batch. +- To benchmark MI-250 1 GCD, we lock the frequency with command `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id. +- To benchmark MI-250 2 GCDs, we observed performance regression with rocm perf-determ mode. The 2 GCDs number is running without perf-determ mode set with command `rocm-smi -d x --resetperfdeterminism`, where `x` is the GPU id. +- Performance results are what we can reproduce and for reference only. It should not be used for other purposes. diff --git a/examples/01_resnet-50/benchmark_ait.py b/examples/01_resnet-50/benchmark_ait.py new file mode 100644 index 000000000..577a4472d --- /dev/null +++ b/examples/01_resnet-50/benchmark_ait.py @@ -0,0 +1,132 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""benchmark for resnet50""" + +import os + +import click + +import torch +from aitemplate.compiler import compile_model, Model + +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target +from modeling.resnet import build_resnet_backbone +from weight_utils import export_to_torch_tensor + + +def mark_output(y): + """Different to PyTorch, we need to explicit mark output tensor for optimization, + + Parameters + ---------- + y : List[Tensor] + List of output tensors + """ + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +def compile_module(model_name, batch_size, **kwargs): + + if model_name != "resnet50": + raise NotImplementedError + + model_name = f"{model_name}_{batch_size}" + target = detect_target(**kwargs) + # Create input tensor, need to specify the shape, dtype and is_input flag + x = Tensor( + shape=[batch_size, 224, 224, 3], dtype="float16", name="input0", is_input=True + ) + model = build_resnet_backbone(50, activation="ReLU") + # Mark all parameters with name same to PyTorch name convention + model.name_parameter_tensor() + # Forward the input tensor to the model, get output tensor + y = model(x) + # Mark output tensor + mark_output(y) + # Compile the model + module = compile_model(y, target, "./tmp", model_name) + return module + + +def benchmark(model_name, batch_size, mod=None, graph_mode=True): + # Load params + cuda_params = export_to_torch_tensor(model_name) + # Load compiled model + if mod is None: + model_name = f"{model_name}_{batch_size}" + mod = Model(os.path.join("./tmp", model_name, "test.so")) + + # Set params + for k, v in cuda_params.items(): + mod.set_constant_with_tensor(k, v) + + # prepare input/output tensor + x_input = torch.randn([batch_size, 224, 224, 
3]).cuda().half() + x_input = x_input.contiguous() + y_output = torch.zeros([batch_size, 1, 1, 1000]).cuda().half() + y_output = y_output.contiguous() + + # warm up + t, _, __ = mod.benchmark_with_tensors( + [x_input], + [y_output], + count=100, + repeat=4, + graph_mode=graph_mode, + ) + # benchmark + t, _, __ = mod.benchmark_with_tensors( + [x_input], + [y_output], + count=100, + repeat=4, + graph_mode=graph_mode, + ) + print(f"batch_size: {batch_size}, latency: {t}") + dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1") + dev_flag = dev_flag.replace(",", "_") + with open(f"resnet50_ait_benchmark_dev_{dev_flag}.txt", "a") as f: + f.write(f"batch_size: {batch_size}, latency: {t}\n") + + +@click.command() +@click.option( + "--use-fp16-acc", + type=bool, + default=True, + help="Whether to use FP16 for accumulation (similar to TensorRT)", +) +@click.option("--use-graph", type=bool, default=True, help="Whether to use CUDA graph") +@click.option("--batch-size", type=int, default=0, help="Batch size") +def main(use_fp16_acc=True, use_graph=True, batch_size=0): + if detect_target().name() == "rocm": + use_graph = False + if batch_size < 1: + for bs in (1, 2, 4, 8, 16, 32, 64, 128, 256): + compile_module("resnet50", bs, use_fp16_acc=use_fp16_acc) + benchmark("resnet50", bs, graph_mode=use_graph) + else: + benchmark("resnet50", batch_size, graph_mode=use_graph) + + +if __name__ == "__main__": + main() diff --git a/examples/01_resnet-50/benchmark_mi250.sh b/examples/01_resnet-50/benchmark_mi250.sh new file mode 100644 index 000000000..883846b68 --- /dev/null +++ b/examples/01_resnet-50/benchmark_mi250.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$1" & +HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size "$1" && fg diff --git a/examples/01_resnet-50/benchmark_pt.py b/examples/01_resnet-50/benchmark_pt.py new file mode 100644 index 000000000..82c74bc89 --- /dev/null +++ b/examples/01_resnet-50/benchmark_pt.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os + +import click +import timm +import torch +from aitemplate.testing.benchmark_pt import benchmark_torch_function + + +def benchmark(model, batch_size): + with torch.inference_mode(): + input_shape = (batch_size, 3, 224, 224) + input_data = torch.randn(input_shape).cuda().half() + # warm up + benchmark_torch_function(100, model, input_data) + # benchmark + t = benchmark_torch_function(100, model, input_data) + print("batch_size: {}, time: {}".format(batch_size, t)) + dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1") + dev_flag = dev_flag.replace(",", "_") + with open(f"resnet50_pt_benchmark_dev_{dev_flag}.txt", "a") as f: + f.write("batch_size: {}, latency: {}\n".format(batch_size, t)) + + +@click.command() +@click.option("--batch-size", default=0, type=int) +def main(batch_size): + model = timm.create_model("resnet50", pretrained=False).cuda().half() + model.eval() + if batch_size == 0: + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256]: + benchmark(model, batch_size) + else: + benchmark(model, batch_size) + + +if __name__ == "__main__": + main() diff --git a/examples/01_resnet-50/infer_with_torch.py b/examples/01_resnet-50/infer_with_torch.py new file mode 100644 index 000000000..23269b2e4 --- /dev/null +++ b/examples/01_resnet-50/infer_with_torch.py @@ -0,0 +1,135 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os + +import numpy as np +import torch +from aitemplate.compiler import compile_model, Model + +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target +from modeling.resnet import build_resnet_backbone +from PIL import Image +from weight_utils import timm_export + + +def mark_output(y): + """Different to PyTorch, we need to explicit mark output tensor for optimization, + + Parameters + ---------- + y : List[Tensor] + List of output tensors + """ + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +def compile_module(model_name, **kwargs): + batch_size = 1 + + if model_name != "resnet50": + raise NotImplementedError + + model_name = f"{model_name}_{batch_size}" + target = detect_target(**kwargs) + # Create input tensor, need to specify the shape, dtype and is_input flag + x = Tensor( + shape=[batch_size, 224, 224, 3], dtype="float16", name="input0", is_input=True + ) + model = build_resnet_backbone(50, activation="ReLU") + # Mark all parameters with name same to PyTorch name convention + model.name_parameter_tensor() + # Forward the input tensor to the model, get output tensor + y = model(x) + # Mark output tensor + mark_output(y) + # Compile the model + module = compile_model(y, target, "./tmp", model_name) + return module + + +def prepare_data(img_path=None): + # we find a 224x224 image online for demo purpose: + img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" + if img_path is None: + if os.path.exists("cat.png") is False: + os.system(f"wget -O cat.png {img_url}") + img_path = "cat.png" + image = Image.open(img_path).resize((224, 224)) + image = torch.as_tensor(np.array(image).astype("float32")).cuda().half() + image = image.unsqueeze(0) + mean = torch.tensor([0.485, 0.456, 0.406]).cuda().half() + std = torch.tensor([0.229, 0.224, 0.225]).cuda().half() + image = (image / 255.0 - mean[None, None, None, :]) / std[None, None, None, :] + return image + + +def export_to_torch_tensor(model_name="resnet50"): + if model_name != "resnet50": + raise NotImplementedError + timm2ait = timm_export(model_name) + params = timm2ait.export_model(half=True) + return params, timm2ait.pt_model + + +def inference(model_name, mod=None): + # Load params + cuda_params, pt_model = export_to_torch_tensor(model_name) + # Load compiled model + if mod is None: + mod = Model(os.path.join("./tmp", model_name, "test.so")) + + # Set torch tensor params to runtime + for k, v in cuda_params.items(): + mod.set_constant_with_tensor(k, v) + + # prepare input/output tensor + x_input = prepare_data() + x_input = x_input.contiguous() + y_output = torch.zeros([1, 1, 1, 1000]).cuda().half() + y_output = y_output.contiguous() + + # execute + mod.run_with_tensors([x_input], [y_output]) + + # process output with pytorch + y_label = torch.argmax(y_output, dim=-1) + y_cpu = y_label.cpu().numpy() + print(y_cpu) + + # run pytorch + pt_model.eval() + pt_model = pt_model.cuda().half() + pt_output = pt_model(x_input.permute([0, 3, 1, 2])) + y_label = torch.argmax(pt_output, dim=-1) + y_cpu = y_label.cpu().numpy() + print(y_cpu) + + # verify outputs + assert torch.allclose(y_output, pt_output, 1e-1, 1e-1) + print("Verification done!") + + +if __name__ == "__main__": + np.random.seed(4896) + model_name = "resnet50" + mod = compile_module(model_name, use_fp16_acc=True) 
+ inference(model_name, mod) diff --git a/examples/01_resnet-50/modeling/__init__.py b/examples/01_resnet-50/modeling/__init__.py new file mode 100644 index 000000000..5cf1a826f --- /dev/null +++ b/examples/01_resnet-50/modeling/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/examples/01_resnet-50/modeling/resnet.py b/examples/01_resnet-50/modeling/resnet.py new file mode 100644 index 000000000..9842aa18d --- /dev/null +++ b/examples/01_resnet-50/modeling/resnet.py @@ -0,0 +1,456 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np +from aitemplate.frontend import nn +from aitemplate.testing import detect_target + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NHWC tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. + Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + +class BasicStem(CNNBlockBase): + """ + The standard ResNet stem (layers before the first residual block), + with a conv, relu and max_pool. 
+ """ + + def __init__(self, in_channels=3, out_channels=64, norm="BN", activation="ReLU"): + super().__init__(in_channels, out_channels, 4) + conv_op = None + if detect_target().name() == "cuda": + if activation == "ReLU": + conv_op = nn.Conv2dBiasReluFewChannels + elif activation == "Hardswish": + conv_op = nn.Conv2dBiasHardswishFewChannels + else: + raise NotImplementedError + else: + if activation == "ReLU": + conv_op = nn.Conv2dBiasRelu + elif activation == "Hardswish": + conv_op = nn.Conv2dBiasHardswish + else: + raise NotImplementedError + self.conv1 = conv_op(in_channels, out_channels, 7, 2, 7 // 2) + self.pool = nn.MaxPool2d(3, 2, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.pool(x) + return x + + +class BasicBlock(CNNBlockBase): + """ + The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, + with two 3x3 conv layers and a projection shortcut if needed. + """ + + def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): + super().__init__(in_channels, out_channels, stride) + + def forward(self, x): + raise NotImplementedError() + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + activation="ReLU", + stride_in_1x1=False, + dilation=1, + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.downsample_0 = nn.Conv2dBias(in_channels, out_channels, 1, stride, 0) + else: + self.downsample_0 = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + conv_op = None + conv_op_add = None + if activation == "ReLU": + conv_op = nn.Conv2dBiasRelu + conv_op_add = nn.Conv2dBiasAddRelu + elif activation == "Hardswish": + conv_op = nn.Conv2dBiasHardswish + conv_op_add = nn.Conv2dBiasAddHardswish + else: + raise NotImplementedError + + self.conv1 = conv_op(in_channels, bottleneck_channels, 1, stride_1x1, 0) + + self.conv2 = conv_op( + bottleneck_channels, + bottleneck_channels, + 3, + stride_3x3, + 1 * dilation, + dilation, + ) + + self.conv3 = conv_op_add(bottleneck_channels, out_channels, 1, 1, 0) + + # for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + # if layer is not None: # shortcut can be None + # weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. 
+ # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. + + def forward(self, x): + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample_0 is not None: + downsample = self.downsample_0(x) + else: + downsample = x + + out = self.conv3(out, downsample) + return out + + +class ResNet(nn.Module): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. + activation (str): activation function to use. + num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + """ + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + # Avoid keeping unused layers in this module. They consume extra memory + # and may cause allreduce to fail + num_stages = max( + [ + {"layer1": 1, "layer2": 2, "layer3": 3, "layer4": 4}.get(f, 0) + for f in out_features + ] + ) + stages = stages[:num_stages] + + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "layer" + str(i + 1) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + + self.stage_names = tuple(self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AvgPool2d(7, 1, 0) + self.fc = nn.Linear(curr_channels, num_classes) + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format( + ", ".join(children) + ) + self.reshape = nn.Reshape() + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + # assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
+ outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = self.fc(x) + if x._rank() == 2: + x = self.reshape(x, [x._size(0), 1, 1, x._size(1)]) + return x + return outputs + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + Returns: + list[CNNBlockBase]: a list of block module. + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. + """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert ( + newk not in kwargs + ), f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class( + in_channels=in_channels, out_channels=out_channels, **curr_kwargs + ) + ) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. + Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. 
+ """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip( + num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels + ): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + + +def make_stage(*args, **kwargs): + """ + Deprecated alias for backward compatibiltiy. + """ + return ResNet.make_stage(*args, **kwargs) + + +def build_resnet_backbone(depth, activation): + """ + Create a ResNet instance from config. + Returns: + ResNet: a :class:`ResNet` instance. + """ + norm = "BN" + activation = activation + num_groups = 1 + stride_in_1x1 = False + num_groups = 1 + width_per_group = 64 + bottleneck_channels = num_groups * width_per_group + in_channels = 64 + out_channels = 256 + + stem = BasicStem(in_channels=3, out_channels=64, norm=norm, activation=activation) + + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + + stages = [] + + for idx, stage_idx in enumerate(range(2, 6)): + # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper + dilation = 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), + "in_channels": in_channels, + "out_channels": out_channels, + "norm": norm, + "activation": activation, + } + # Use BasicBlock for R18 and R34. + if depth in [18, 34]: + stage_kargs["block_class"] = BasicBlock + else: + stage_kargs["bottleneck_channels"] = bottleneck_channels + stage_kargs["stride_in_1x1"] = stride_in_1x1 + stage_kargs["dilation"] = dilation + stage_kargs["num_groups"] = num_groups + stage_kargs["block_class"] = BottleneckBlock + blocks = ResNet.make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + + return ResNet(stem, stages, num_classes=1000) diff --git a/examples/01_resnet-50/weight_utils.py b/examples/01_resnet-50/weight_utils.py new file mode 100644 index 000000000..beaebd330 --- /dev/null +++ b/examples/01_resnet-50/weight_utils.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +script for converting model from timm to aitemplate +Only tested on resnet50 +""" + + +import pickle +import re + +import click +import numpy as np +import timm +import torch +from aitemplate.testing import detect_target + +CONV_WEIGHT_PATTERN = re.compile(r"conv\d+\.weight") + + +class timm_export(object): + def __init__(self, model_name): + self.model_name = model_name + if model_name != "resnet50": + raise NotImplementedError + + with torch.no_grad(): + self.pt_model = timm.create_model( + model_name, pretrained=True, num_classes=1000 + ) + self.pt_state = self.pt_model.state_dict() + + def export_model(self, half=True): + fused_model = {} + for param_name in self.pt_state.keys(): + self.transform_params(param_name, fused_model) + ait_model = {k.replace(".", "_"): weight for k, weight in fused_model.items()} + if detect_target().name() == "cuda": + self.export_conv0(ait_model, fused_model) + if half: + half_params = {} + for k, v in ait_model.items(): + half_params[k] = v.detach().cuda().half().contiguous() + return half_params + return ait_model + + def fuse_conv_bn_weights( + self, conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b, transpose=False + ): + conv_w = torch.tensor(conv_w) + bn_rm = torch.tensor(bn_rm) + bn_rv = torch.tensor(bn_rv) + bn_w = torch.tensor(bn_w) + bn_b = torch.tensor(bn_b) + bn_eps = torch.tensor(bn_eps) + + if conv_b is None: + conv_b = torch.zeros_like(bn_rm) + if bn_w is None: + bn_w = torch.ones_like(bn_rm) + if bn_b is None: + bn_b = torch.zeros_like(bn_rm) + bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) + + if transpose: + shape = [1, -1] + [1] * (len(conv_w.shape) - 2) + else: + shape = [-1, 1] + [1] * (len(conv_w.shape) - 2) + + conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape(shape) + conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b + + # NCHW -> NHWC + conv_w = conv_w.permute(0, 2, 3, 1).contiguous() + for arr in [conv_w.numpy(), conv_b.numpy()]: + if np.isnan(arr).any(): + print("fuse bn error") + return conv_w, conv_b + + def transform_conv0(self): + conv_w = self.pt_state["conv1.weight"] + bn_w = self.pt_state["bn1.weight"] + bn_b = self.pt_state["bn1.bias"] + bn_rm = self.pt_state["bn1.running_mean"] + bn_rv = self.pt_state["bn1.running_var"] + fused_w, fused_b = self.fuse_conv_bn_weights( + conv_w, None, bn_rm, bn_rv, 1e-5, bn_w, bn_b + ) + return fused_w, fused_b + + def transform_params(self, param_name, fused_model): + if param_name == "conv1.weight": + fused_w, fused_b = self.transform_conv0() + fused_model["stem.conv1.weight"] = fused_w + fused_model["stem.conv1.bias"] = fused_b + elif "downsample.0.weight" in param_name: + fused_w, fused_b = self.transform_downsample(param_name) + fused_model[param_name] = fused_w + fused_model[param_name.replace("weight", "bias")] = fused_b + elif param_name == "fc.weight": + fused_model["fc.weight"] = self.pt_state["fc.weight"] + fused_model["fc.bias"] = self.pt_state["fc.bias"] + elif CONV_WEIGHT_PATTERN.search(param_name) is not None: + bn_w_name = param_name.replace("conv", "bn") + conv_w = self.pt_state[param_name] + bn_w = self.pt_state[bn_w_name] + bn_b = self.pt_state[bn_w_name.replace("weight", "bias")] + bn_rm = self.pt_state[bn_w_name.replace("weight", "running_mean")] + bn_rv = self.pt_state[bn_w_name.replace("weight", "running_var")] + fused_w, fused_b = self.fuse_conv_bn_weights( + conv_w, None, bn_rm, bn_rv, 1e-5, bn_w, bn_b + ) + fused_model[param_name] = fused_w + fused_model[param_name.replace("weight", "bias")] = fused_b + else: + pass + + def transform_downsample(self, 
param_name):
+        assert "downsample" in param_name
+        tags = param_name.split(".")
+        block_tag = ".".join(tags[:2])
+        conv_w = self.pt_state[f"{block_tag}.downsample.0.weight"]
+        bn_w = self.pt_state[f"{block_tag}.downsample.1.weight"]
+        bn_b = self.pt_state[f"{block_tag}.downsample.1.bias"]
+        bn_rm = self.pt_state[f"{block_tag}.downsample.1.running_mean"]
+        bn_rv = self.pt_state[f"{block_tag}.downsample.1.running_var"]
+        fused_w, fused_b = self.fuse_conv_bn_weights(
+            conv_w, None, bn_rm, bn_rv, 1e-5, bn_w, bn_b
+        )
+        return fused_w, fused_b
+
+    def export_conv0(self, ait_model, fuse_model):
+        pt_name = "stem.conv1.weight"
+        x = fuse_model[pt_name]
+        conv_w = torch.zeros((64, 7, 7, 4))
+        conv_w[:, :, :, :3] = x
+        ait_model[pt_name.replace(".", "_")] = conv_w
+
+
+def export_to_torch_tensor(model_name="resnet50"):
+    if model_name != "resnet50":
+        raise NotImplementedError
+    timm2ait = timm_export(model_name)
+    ait_model = timm2ait.export_model(half=True)
+    return ait_model
+
+
+@click.command()
+@click.option("--param-path", type=str, default="resnet50.pkl")
+def export_to_numpy(param_path):
+    ait_model = export_to_torch_tensor()
+    np_weights = {}
+    for k, v in ait_model.items():
+        np_weights[k] = v.detach().cpu().numpy().astype(np.float16)
+
+    with open(param_path, "wb") as f:
+        pickle.dump(np_weights, f)
+
+
+if __name__ == "__main__":
+    export_to_numpy()
diff --git a/examples/02_detectron2/README.md b/examples/02_detectron2/README.md
new file mode 100644
index 000000000..99fadec85
--- /dev/null
+++ b/examples/02_detectron2/README.md
@@ -0,0 +1,169 @@
+# Getting Started with AIT for the Inference of Detectron2 Based Models
+
+This document describes how to use AIT for the inference of Detectron2 vision models such as Mask R-CNN and Faster R-CNN.
+
+For an end-to-end example with the API, see `prepare_and_run_rcnn.sh`, which covers how to prepare and run inference with `mask_rcnn_R_50_FPN`.
+
+## Create the AIT Model from a Config File
+
+1. Pick a model and its config file from `configs`, for example, `mask_rcnn_R_50_FPN.yaml`.
+
+2. Build the AIT model by running `compile_model.py` with the config file, for example:
+
+```
+cfg=examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 examples/02_detectron2/compile_model.py \
+    --config $cfg \
+    --batch 1
+```
+
+The parameters of the built AIT model are not initialized yet and are therefore filled with random values. They are initialized in the next step, when the weights of the pre-trained model are exported to the AIT model. Check `tmp/mask_rcnn_R_50_FPN/params.json` for the list of parameters in the AIT model and their shapes.
+
+## Download the Detectron2 Pre-trained Model, and Export the Weights to the AIT Model
+
+1. For example, download the Detectron2 `mask_rcnn_R_50_FPN` pre-trained model and save it to `tmp/pt_mask_rcnn_R_50_FPN.pkl`:
+
+```
+wget https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl -O tmp/pt_mask_rcnn_R_50_FPN.pkl
+```
+
+2. Export the weights from the pre-trained model to the AIT model by running `tools/convert_pt2ait.py`:
+
+```
+python3 examples/02_detectron2/tools/convert_pt2ait.py \
+    --d2-weight tmp/pt_mask_rcnn_R_50_FPN.pkl \
+    --ait-weight tmp/ait_mask_rcnn_R_50_FPN.pt \
+    --model-name mask_rcnn_R_50_FPN
+```
+
+The weights are exported to AIT and saved as `tmp/ait_mask_rcnn_R_50_FPN.pt` for the inference run.
+
+## Download the Inference Dataset and Run the AIT Model
+
+1.
For example, download the COCO 2017 Dataset: + +``` +mkdir -p ~/.torch/datasets/coco + +wget https://dl.fbaipublicfiles.com/detectron2/annotations/coco/val2017_100.tgz -O ~/.torch/datasets/coco/val2017_100.tgz +tar xzf ~/.torch/datasets/coco/val2017_100.tgz -C ~/.torch/datasets/coco && rm -f ~/.torch/datasets/coco/val2017_100.tgz +``` + +2. Run inference of the AIT model on the inputs with `demo.py`: + +``` +python3 examples/02_detectron2/demo.py \ + --weight tmp/ait_mask_rcnn_R_50_FPN.pt \ + --config examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml \ + --batch 1 --input "~/.torch/datasets/coco/val2017/*.jpg" \ + --confidence-threshold 0.5 \ + --display \ + --cudagraph +``` + +## Multi-GPU profiling +AIT requires to do profiling to decide best algorithms for CUTLASS and CK. +To enable multiple GPUs profiling, set the environment variable `CUDA_VISIBLE_DEVICES` on NVIDIA platform and `HIP_VISIBLE_DEVICES` on AMD platform with available GPU ids. + + +## Results +_PT = PyTorch 1.12 Eager_ +### A100-40GB / CUDA 11.6 + +- Input size: 448x608 + +| Batch size | PT Latency (ms) | PT FPS | AIT Latency (ms) | AIT FPS | +|------------|-----------------|--------|------------------|---------| +| 1 | 21.70 | 46.09 | 4.40 | 227.27 | +| 2 | 29.71 | 67.32 | 6.68 | 299.40 | +| 4 | 35.67 | 112.13 | 11.12 | 359.71 | +| 8 | 59.71 | 133.98 | 22.24 | 359.71 | +| 16 | 112.91 | 141.70 | 36.64 | 436.68 | +| 32 | 224.24 | 142.70 | 71.04 | 450.45 | +| 64 | 448.84 | 142.59 | 140.16 | 456.62 | + +- Input size: 800x1344 + +| Batch size | PT Latency (ms) | PT FPS | AIT Latency (ms) | AIT FPS | +|------------|-----------------|--------|------------------|---------| +| 1 | 22.99 | 43.50 | 8.50 | 117.65 | +| 2 | 34.48 | 58.01 | 13.42 | 149.03 | +| 4 | 65.00 | 61.54 | 22.88 | 174.83 | +| 8 | 125.25 | 63.87 | 41.44 | 193.05 | +| 16 | 246.49 | 64.91 | 78.56 | 203.67 | +| 32 | 503.21 | 63.59 | 154.56 | 207.04 | +| 64 | OOM | OOM | 304.64 | 210.08 | + + +### MI-250 / ROCm 5.2.3 / HIPCC-10736 +_PT = PyTorch 1.12 Eager_ +#### 1 GCDs + +- Input size: 448x608 + +| Batch size | PT Latency (ms) | PT FPS | AIT Latency (ms) | AIT FPS | +|------------|-----------------|--------|------------------|---------| +| 1 | 24.75 | 40.41 | 10.63 | 94.07 | +| 2 | 29.28 | 68.30 | 15.96 | 125.31 | +| 4 | 42.45 | 94.24 | 26.24 | 152.44 | +| 8 | 79.73 | 100.34 | 51.04 | 156.74 | +| 16 | 141.84 | 112.81 | 89.12 | 179.53 | +| 32 | 284.39 | 112.52 | 161.92 | 197.63 | +| 64 | 600.84 | 106.52 | Error | Error | + +- Input size: 800x1344 + +| Batch size | PT Latency (ms) | PT FPS | AIT Latency (ms) | AIT FPS | +|------------|-----------------|--------|------------------|---------| +| 1 | 26.80 | 37.31 | 19.23 | 52.00 | +| 2 | 43.61 | 45.86 | 30.28 | 66.05 | +| 4 | 98.88 | 40.45 | 51.56 | 77.58 | +| 8 | 189.45 | 42.23 | 98.80 | 80.97 | +| 16 | 389.94 | 41.03 | 177.28 | 90.25 | +| 32 | 807.22 | 39.64 | 333.44 | 95.97 | +| 64 | 1768.66 | 36.19 | Error | Error | + +#### 2 GCDs + +- Input size: 448x608 + +| Batch size | AIT Latency (ms) | AIT FPS | +|------------|------------------|---------| +| 1 | | | +| 2 | 12.78 | 156.49 | +| 4 | 20.66 | 193.61 | +| 8 | 32.16 | 248.76 | +| 16 | 61.52 | 260.08 | +| 32 | 106.08 | 301.66 | +| 64 | 194.24 | 329.49 | + + +- Input size: 800x1344 + +| Batch size | AIT Latency (ms) | AIT FPS | +|------------|------------------|---------| +| 1 | | | +| 2 | 22 | 90.91 | +| 4 | 34 | 117.65 | +| 8 | 55.52 | 144.09 | +| 16 | 104.48 | 153.14 | +| 32 | 190.24 | 168.21 | +| 64 | 362.88 | 176.37 | + + +### Sample outputs + 
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_d2_1.jpg)
+
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_d2_2.jpg)
+
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_d2_3.jpg)
+
+
+### Notes on Performance Results
+
+- For NVIDIA A100, our test cluster does not allow locking the clock frequency. We use a longer warm-up to collect more stable results, but a small variance relative to results measured with a locked frequency is expected.
+- To benchmark MI-250, first run `python3 benchmark_ait.py` to generate all the required model dynamic library files on a single GCD. Then run `./benchmark_mi250.sh {batch_size}` to simulate data-parallel execution on 2 GCDs, with each GCD processing half of the batch.
+- To benchmark MI-250 on 1 GCD, we lock the frequency with the command `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
+- To benchmark MI-250 on 2 GCDs, we observed a performance regression with the ROCm perf-determinism mode, so the 2-GCD numbers are collected with that mode reset via `rocm-smi -d x --resetperfdeterminism`, where `x` is the GPU id.
+- Performance results are what we were able to reproduce; they should not be used for any other purpose.
diff --git a/examples/02_detectron2/compile_model.py b/examples/02_detectron2/compile_model.py
new file mode 100644
index 000000000..4bf5d4d25
--- /dev/null
+++ b/examples/02_detectron2/compile_model.py
@@ -0,0 +1,149 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
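One note on reading the benchmark tables above: the FPS column is simply the batch size divided by the latency in seconds. A tiny helper, hypothetical and not part of this patch, that reproduces a couple of the A100 448x608 rows:

```python
def fps(batch_size: int, latency_ms: float) -> float:
    """Images per second for a batch processed in `latency_ms` milliseconds."""
    return batch_size * 1000.0 / latency_ms


print(round(fps(1, 4.40), 2))     # 227.27, the AIT batch-1 row
print(round(fps(64, 140.16), 2))  # 456.62, the AIT batch-64 row
```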
+# +import json +import os + +import click + +import numpy as np +import torch +from aitemplate.compiler import compile_model, Model + +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target +from configs import get_cfg_defaults +from modeling.meta_arch import GeneralizedRCNN + +# pylint: disable=W0102 + + +def rand_init(shape): + if len(shape) == 1: + arr = np.zeros(shape).astype("float16") + else: + fout = shape[0] + fin = shape[-1] + scale = np.sqrt(2) / np.sqrt(fout + fin) + arr = np.random.normal(0, scale, shape).astype("float16") + return torch.from_numpy(arr).cuda().half() + + +def mark_output(y): + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +def get_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +def extract_params_meta(net): + ret = [] + params = net.parameters() + for p in params: + t = p.tensor() + name = t._attrs["name"] + shape = [x._attrs["values"][0] for x in t._attrs["shape"]] + ret.append([name, shape]) + return ret + + +def benchmark(cfg, mod=None): + im_shape = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, 3) + HH, WW, CC = im_shape + BS = cfg.SOLVER.IMS_PER_BATCH + inputs = np.random.normal(0, 1, (BS, HH, WW, CC)).astype("float16") + + model_name = cfg.MODEL.NAME + if mod is None: + mod = Model(os.path.join("./tmp", model_name, "test.so")) + + ait_mod = GeneralizedRCNN(cfg) + + for name, param in ait_mod.named_parameters(): + shape = get_shape(param.tensor()) + arr = rand_init(shape) + mod.set_constant_with_tensor(name.replace(".", "_"), arr) + + x_input = torch.tensor(inputs).cuda().half() + x = x_input.contiguous() + + GeneralizedRCNN(cfg).set_anchors(mod) + + topk = cfg.POSTPROCESS.TOPK + outputs = [ + torch.empty([BS, 1], dtype=torch.int64).cuda(), + torch.empty([BS, topk, 4]).cuda().half(), + torch.empty([BS, topk]).cuda().half(), + torch.empty([BS, topk], dtype=torch.int64).cuda(), + ] + if cfg.MODEL.MASK_ON: + mask_size = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION * 2 + outputs.append(torch.empty([BS, topk, mask_size, mask_size]).cuda().half()) + + mod.benchmark_with_tensors([x], outputs, count=100, repeat=2, graph_mode=True) + + +def compile_module(cfg): + model_name = cfg.MODEL.NAME + target = detect_target() + + im_shape = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, 3) + HH, WW, CC = im_shape + BS = cfg.SOLVER.IMS_PER_BATCH + x = Tensor(shape=[BS, HH, WW, CC], dtype="float16", name="input_0", is_input=True) + model = GeneralizedRCNN(cfg) + model.name_parameter_tensor() + + y = model(x) + mark_output(y) + module = compile_model(y, target, "./tmp", model_name) + + with open(os.path.join("./tmp", model_name, "params.json"), "w") as fo: + fo.write(json.dumps(extract_params_meta(model))) + + benchmark(cfg, module) + + +@click.command() +@click.option("--config", default="", metavar="FILE", help="path to config file") +@click.option("--bench-config", default="", metavar="FILE", help="path to config file") +@click.option("--batch", default=0, help="batch size") +@click.option("--eval/--no-eval", default=False, help="perform evaluation only") +def compile_and_benchmark(config, bench_config, batch, eval): + cfg = get_cfg_defaults() + cfg.merge_from_file(config) + if bench_config != "": + cfg.merge_from_file(bench_config) + if batch > 0: + cfg.SOLVER.IMS_PER_BATCH = batch + cfg.freeze() 
+ print(cfg.MODEL.NAME) + + if eval: + benchmark(cfg) + else: + compile_module(cfg) + + +if __name__ == "__main__": + np.random.seed(4896) + compile_and_benchmark() diff --git a/examples/02_detectron2/configs/__init__.py b/examples/02_detectron2/configs/__init__.py new file mode 100644 index 000000000..679ca77c9 --- /dev/null +++ b/examples/02_detectron2/configs/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from .config import get_cfg_defaults + +__all__ = ["get_cfg_defaults"] diff --git a/examples/02_detectron2/configs/config.py b/examples/02_detectron2/configs/config.py new file mode 100644 index 000000000..c9cf1e5c3 --- /dev/null +++ b/examples/02_detectron2/configs/config.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from yacs.config import CfgNode + + +def get_cfg_defaults() -> CfgNode: + """ + Get a copy of the default config. + Returns: + a detectron2 CfgNode instance. + """ + from .defaults import _C + + return _C.clone() diff --git a/examples/02_detectron2/configs/defaults.py b/examples/02_detectron2/configs/defaults.py new file mode 100644 index 000000000..c2bb11eb7 --- /dev/null +++ b/examples/02_detectron2/configs/defaults.py @@ -0,0 +1,668 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from yacs.config import CfgNode as CN + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +# The version number, to upgrade from old configs to new ones if any +# changes happen. It's recommended to keep a VERSION in your config file. 
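Before walking through the individual options, it may help to see how this defaults tree is consumed by the example scripts above (`compile_model.py`, `demo.py`): the `_C` node is cloned via `get_cfg_defaults()`, merged with a YAML config, optionally overridden, and then frozen. A minimal sketch, assuming it is run from `examples/02_detectron2` so the `configs` package is importable:

```python
from configs import get_cfg_defaults  # returns _C.clone(); see configs/config.py

cfg = get_cfg_defaults()
cfg.merge_from_file("configs/mask_rcnn_R_50_FPN.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2  # override the batch size before freezing
cfg.freeze()                  # make the config immutable

print(cfg.MODEL.NAME)        # "mask_rcnn_R_50_FPN"
print(cfg.POSTPROCESS.TOPK)  # 100, as set in that YAML
```

The `_C.VERSION` assignment that immediately follows tags the schema so that old config files can be upgraded if the defaults change.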
+_C.VERSION = 2 + +_C.MODEL = CN() +_C.MODEL.NAME = "" +_C.MODEL.LOAD_PROPOSALS = False +_C.MODEL.MASK_ON = False +_C.MODEL.KEYPOINT_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" + +# Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file +# to be loaded to the model. You can find available models in the model zoo. +_C.MODEL.WEIGHTS = "" + +# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR). +# To train on images of different number of channels, just set different mean & std. +# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] +_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. +# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) +_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0] + +# ----------------------------------------------------------------------------- +# POST PROCESS +# ----------------------------------------------------------------------------- +_C.POSTPROCESS = CN() +_C.POSTPROCESS.POST_ON = True +_C.POSTPROCESS.USE_TOPK = True +_C.POSTPROCESS.TOPK = 130 + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge. +# Please refer to ResizeShortestEdge for detailed definition. +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = (800,) +# Sample size of smallest side by choice or random selection from range give by +# INPUT.MIN_SIZE_TRAIN +_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice" +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing. Set to zero to disable resize in testing. +_C.INPUT.MIN_SIZE_TEST = 800 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 +# Mode for flipping images used in data augmentation during training +# choose one of ["horizontal, "vertical", "none"] +_C.INPUT.RANDOM_FLIP = "horizontal" + +# `True` if cropping is used for data augmentation during training +_C.INPUT.CROP = CN({"ENABLED": False}) +# Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation. +_C.INPUT.CROP.TYPE = "relative_range" +# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of +# pixels if CROP.TYPE is "absolute" +_C.INPUT.CROP.SIZE = [0.9, 0.9] + + +# Whether the model needs RGB, YUV, HSV etc. +# Should be one of the modes defined here, as we use PIL to read the image: +# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes +# with BGR being the one exception. One can set image format to BGR, we will +# internally use RGB for conversion and flip the channels over +_C.INPUT.FORMAT = "BGR" +# The ground truth mask format that the model will use. +# Mask R-CNN supports either "polygon" or "bitmask" as ground truth. +_C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask" + + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training. 
Must be registered in DatasetCatalog +# Samples from these datasets will be merged and used as one dataset. +_C.DATASETS.TRAIN = () +# List of the pre-computed proposal files for training, which must be consistent +# with datasets listed in DATASETS.TRAIN. +_C.DATASETS.PROPOSAL_FILES_TRAIN = () +# Number of top scoring precomputed proposals to keep for training +_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000 +# List of the dataset names for testing. Must be registered in DatasetCatalog +_C.DATASETS.TEST = () +# List of the pre-computed proposal files for test, which must be consistent +# with datasets listed in DATASETS.TEST. +_C.DATASETS.PROPOSAL_FILES_TEST = () +# Number of top scoring precomputed proposals to keep for test +_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000 + +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. +_C.DATALOADER.ASPECT_RATIO_GROUPING = True +# Options: TrainingSampler, RepeatFactorTrainingSampler +_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler" +# Repeat threshold for RepeatFactorTrainingSampler +_C.DATALOADER.REPEAT_THRESHOLD = 0.0 +# Tf True, when working on datasets that have instance annotations, the +# training dataloader will filter out images without associated annotations +_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +_C.MODEL.BACKBONE.NAME = "build_resnet_backbone" +# Freeze the first several stages so they are not trained. +# There are 5 stages in ResNet. The first is a convolution, and the following +# stages are each group of residual blocks. +_C.MODEL.BACKBONE.FREEZE_AT = 2 + + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.FPN = CN() +# Names of the input feature maps to be used by FPN +# They must have contiguous power of 2 strides +# e.g., ["res2", "res3", "res4", "res5"] +_C.MODEL.FPN.IN_FEATURES = [] +_C.MODEL.FPN.OUT_CHANNELS = 256 + +# Options: "" (no norm), "GN" +_C.MODEL.FPN.NORM = "" + +# Types for fusing the FPN top-down and lateral features. 
Can be either "sum" or "avg" +_C.MODEL.FPN.FUSE_TYPE = "sum" + + +# ---------------------------------------------------------------------------- # +# Proposal generator options +# ---------------------------------------------------------------------------- # +_C.MODEL.PROPOSAL_GENERATOR = CN() +# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals" +_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" +# Proposal height and width both need to be greater than MIN_SIZE +# (a the scale used during training or inference) +_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0 + + +# ---------------------------------------------------------------------------- # +# Anchor generator options +# ---------------------------------------------------------------------------- # +_C.MODEL.ANCHOR_GENERATOR = CN() +# The generator can be any name in the ANCHOR_GENERATOR registry +_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" +# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input. +# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for +# IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1. +# When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES. +_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]] +# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect +# ratios are generated by an anchor generator. +# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W) +# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true, +# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used +# for all IN_FEATURES. +_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]] +# Anchor angles. +# list[list[float]], the angle in degrees, for each input feature map. +# ANGLES[i] specifies the list of angles for IN_FEATURES[i]. +_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]] +# Relative offset between the center of the first anchor and the top-left corner of the image +# Value has to be in [0, 1). Recommend to use 0.5, which means half stride. +# The value is not expected to affect model accuracy. +_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0 + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY + +# Names of the input feature maps to be used by RPN +# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN +_C.MODEL.RPN.IN_FEATURES = ["res4"] +# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.BOUNDARY_THRESH = -1 +# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD] +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example: 1) +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example: 0) +# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD) +# are ignored (-1) +_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7] +_C.MODEL.RPN.IOU_LABELS = [0, -1, 1] +# Number of regions per image used to train RPN +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Options are: "smooth_l1", "giou", "diou", "ciou" +_C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1" +_C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0 +# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets +_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) +# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. +_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0 +_C.MODEL.RPN.LOSS_WEIGHT = 1.0 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000 +_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +# When FPN is used, this limit is applied per level and then again to the union +# of proposals from all levels +# NOTE: When FPN is used, the meaning of this config is different from Detectron1. +# It means per-batch topk in Detectron1, but per-image topk here. +# See the "find_top_rpn_proposals" function for details. +_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 +# Set this to -1 to use the same number of output channels as input channels. +_C.MODEL.RPN.CONV_DIMS = [-1] + +_C.MODEL.RPN.RPN_DIM = 256 + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads" +# Number of foreground classes +_C.MODEL.ROI_HEADS.NUM_CLASSES = 80 +# Names of the input feature maps to be used by ROI heads +# Currently all heads (box, mask, ...) use the same input feature map list +# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN +_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"] +# IOU overlap ratios [IOU_THRESHOLD] +# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD) +# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5] +_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1] +# RoI minibatch size *per image* (number of regions of interest [ROIs]) during training +# Total number of RoIs per training minibatch = +# ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH +# E.g., a common configuration is: 512 * 16 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e. 
class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down +# inference. +_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5 +# If True, augment proposals with ground-truth boxes before sampling proposals to +# train ROI heads. +_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True + +# ---------------------------------------------------------------------------- # +# Box Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_BOX_HEAD = CN() +# C4 don't use head name option +# Options for non-C4 models: FastRCNNConvFCHead, +_C.MODEL.ROI_BOX_HEAD.NAME = "" +# Options are: "smooth_l1", "giou", "diou", "ciou" +_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1" +# The final scaling coefficient on the box regression loss, used to balance the magnitude of its +# gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`. +_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0 +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0) +# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. +_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0 +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" + +_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0 +# Hidden layer dimension for FC layers in the RoI box head +_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024 +_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0 +# Channel dimension for Conv layers in the RoI box head +_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256 +# Normalization method for the convolution layers. +# Options: "" (no norm), "GN", "SyncBN". +_C.MODEL.ROI_BOX_HEAD.NORM = "" +# Whether to use class agnostic for bbox regression +_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False +# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes. +_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False + +# Federated loss can be used to improve the training of LVIS +_C.MODEL.ROI_BOX_HEAD.USE_FED_LOSS = False +# Sigmoid cross entrophy is used with federated loss +_C.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE = False +# The power value applied to image_count when calcualting frequency weight +_C.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER = 0.5 +# Number of classes to keep in total +_C.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES = 50 + +# ---------------------------------------------------------------------------- # +# Cascaded Box Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_BOX_CASCADE_HEAD = CN() +# The number of cascade stages is implicitly defined by the length of the following two configs. 
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = ( + (10.0, 10.0, 5.0, 5.0), + (20.0, 20.0, 10.0, 10.0), + (30.0, 30.0, 15.0, 15.0), +) +_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7) + + +# ---------------------------------------------------------------------------- # +# Mask Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_MASK_HEAD = CN() +_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead" +_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head +_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256 +# Normalization method for the convolution layers. +# Options: "" (no norm), "GN", "SyncBN". +_C.MODEL.ROI_MASK_HEAD.NORM = "" +# Whether to use class agnostic for mask prediction +_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2" + + +# ---------------------------------------------------------------------------- # +# Keypoint Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_KEYPOINT_HEAD = CN() +_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead" +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8)) +_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO. + +# Images with too few (or no) keypoints are excluded from training. +_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1 +# Normalize by the total number of visible keypoints in the minibatch if True. +# Otherwise, normalize by the total number of keypoints that could ever exist +# in the minibatch. +# The keypoint softmax loss is only calculated on visible keypoints. +# Since the number of visible keypoints can vary significantly between +# minibatches, this has the effect of up-weighting the importance of +# minibatches with few visible keypoints. (Imagine the extreme case of +# only one visible keypoint versus N: in the case of N, each one +# contributes 1/N to the gradient compared to the single keypoint +# determining the gradient direction). Instead, we can normalize the +# loss by the total number of keypoints, if it were the case that all +# keypoints were visible in a full minibatch. (Returning to the example, +# this means that the one visible keypoint contributes as much as each +# of the N keypoints.) +_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True +# Multi-task loss weight to use for keypoints +# Recommended values: +# - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True +# - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False +_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0 +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2" + +# ---------------------------------------------------------------------------- # +# Semantic Segmentation Head +# ---------------------------------------------------------------------------- # +_C.MODEL.SEM_SEG_HEAD = CN() +_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead" +_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"] +# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for +# the correposnding pixel. 
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255 +# Number of classes in the semantic segmentation head +_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54 +# Number of channels in the 3x3 convs inside semantic-FPN heads. +_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128 +# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride. +_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4 +# Normalization method for the convolution layers. Options: "" (no norm), "GN". +_C.MODEL.SEM_SEG_HEAD.NORM = "GN" +_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0 + +_C.MODEL.PANOPTIC_FPN = CN() +# Scaling of all losses from instance detection / segmentation head. +_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0 + +# options when combining instance & semantic segmentation outputs +_C.MODEL.PANOPTIC_FPN.COMBINE = CN( + {"ENABLED": True} +) # "COMBINE.ENABLED" is deprecated & not used +_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5 +_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096 +_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5 + + +# ---------------------------------------------------------------------------- # +# RetinaNet Head +# ---------------------------------------------------------------------------- # +_C.MODEL.RETINANET = CN() + +# This is the number of foreground classes. +_C.MODEL.RETINANET.NUM_CLASSES = 80 + +_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.MODEL.RETINANET.NUM_CONVS = 4 + +# IoU overlap ratio [bg, fg] for labeling anchors. +# Anchors with < bg are labeled negative (0) +# Anchors with >= bg and < fg are ignored (-1) +# Anchors with >= fg are labeled positive (1) +_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5] +_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1] + +# Prior prob for rare case (i.e. foreground) at the beginning of training. +# This is used to set the bias for the logits layer of the classifier subnet. +# This improves training stability in the case of heavy class imbalance. 
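The comment above is easier to follow with the usual RetinaNet-style formula spelled out: the classifier bias is initialized so that the initial sigmoid score equals the prior probability, i.e. `bias = -log((1 - p) / p)`. A small worked check, illustrative only and not part of this patch, for the `PRIOR_PROB` value assigned just below:

```python
import math

prior_prob = 0.01  # the MODEL.RETINANET.PRIOR_PROB default below
bias = -math.log((1.0 - prior_prob) / prior_prob)

print(round(bias, 3))                           # -4.595
print(round(1.0 / (1.0 + math.exp(-bias)), 3))  # 0.01: sigmoid(bias) recovers the prior
```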
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01 + +# Inference cls score threshold, only anchors with score > INFERENCE_TH are +# considered for inference (to improve speed) +_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05 +# Select topk candidates before NMS +_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000 +_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5 + +# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets +_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) + +# Loss parameters +_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0 +_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25 +_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1 +# Options are: "smooth_l1", "giou", "diou", "ciou" +_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1" + +# One of BN, SyncBN, FrozenBN, GN +# Only supports GN until unshared norm is implemented +_C.MODEL.RETINANET.NORM = "" + + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +_C.MODEL.RESNETS.DEPTH = 50 + +_C.MODEL.RESNETS.STAGES = [3, 4, 6, 3] + +_C.MODEL.RESNETS.FILTERS = [64, 256, 512, 1024, 2048] + +_C.MODEL.RESNETS.OUT_FEATURES = [ + "res4" +] # res4 for C4 backbone, res2..5 for FPN backbone + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Options: FrozenBN, GN, "SyncBN", "BN" +_C.MODEL.RESNETS.NORM = "FrozenBN" + +# Baseline width of each group. +# Scaling this parameters will scale the width of all bottleneck layers. +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +# Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet +# For R18 and R34, this needs to be set to 64 +_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# Apply Deformable Convolution in stages +# Specify if apply deform_conv on Res2, Res3, Res4, Res5 +_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False] +# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168); +# Use False for DeformableV1. +_C.MODEL.RESNETS.DEFORM_MODULATED = False +# Number of groups in deformable conv. +_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1 + + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() + +# Options: WarmupMultiStepLR, WarmupCosineLR. +# See detectron2/solver/build.py for definition. +_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR" + +_C.SOLVER.MAX_ITER = 40000 + +_C.SOLVER.BASE_LR = 0.001 +# The end lr, only used by WarmupCosineLR +_C.SOLVER.BASE_LR_END = 0.0 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.NESTEROV = False + +_C.SOLVER.WEIGHT_DECAY = 0.0001 +# The weight decay that's applied to parameters of normalization layers +# (typically the affine transformation) +_C.SOLVER.WEIGHT_DECAY_NORM = 0.0 + +_C.SOLVER.GAMMA = 0.1 +# The iteration number to decrease learning rate by GAMMA. 
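Together with `GAMMA` and `BASE_LR` above and the `STEPS` and `WARMUP_*` options that follow, these settings describe a `WarmupMultiStepLR` schedule: a linear warm-up from `WARMUP_FACTOR * BASE_LR` up to `BASE_LR`, then a decay by `GAMMA` at each milestone in `STEPS`. A rough, framework-free sketch of that behavior using the defaults in this file (an approximation for illustration, not the exact detectron2 implementation):

```python
import bisect


def lr_at(it, base_lr=0.001, gamma=0.1, steps=(30000,),
          warmup_factor=1.0 / 1000, warmup_iters=1000):
    # Linear warm-up below warmup_iters, then step decay at each milestone.
    warmup = 1.0
    if it < warmup_iters:
        alpha = it / warmup_iters
        warmup = warmup_factor * (1 - alpha) + alpha
    return base_lr * warmup * gamma ** bisect.bisect_right(steps, it)


print(lr_at(0))      # ~1e-06, start of warm-up
print(lr_at(1000))   # 0.001, warm-up finished
print(lr_at(30000))  # ~0.0001, decayed by gamma after the first milestone
```

Note that these solver options only matter for training; the AIT examples in this patch run inference only.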
+_C.SOLVER.STEPS = (30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000 +_C.SOLVER.WARMUP_ITERS = 1000 +_C.SOLVER.WARMUP_METHOD = "linear" + +# Save a checkpoint after every this number of iterations +_C.SOLVER.CHECKPOINT_PERIOD = 5000 + +# Number of images per batch across all machines. This is also the number +# of training images per step (i.e. per iteration). If we use 16 GPUs +# and IMS_PER_BATCH = 32, each GPU will see 2 images per batch. +# May be adjusted automatically if REFERENCE_WORLD_SIZE is set. +_C.SOLVER.IMS_PER_BATCH = 16 + +# The reference number of workers (GPUs) this config is meant to train with. +# It takes no effect when set to 0. +# With a non-zero value, it will be used by DefaultTrainer to compute a desired +# per-worker batch size, and then scale the other related configs (total batch size, +# learning rate, etc) to match the per-worker batch size. +# See documentation of `DefaultTrainer.auto_scale_workers` for details: +_C.SOLVER.REFERENCE_WORLD_SIZE = 0 + +# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for +# biases. This is not useful (at least for recent models). You should avoid +# changing these and they exist only to reproduce Detectron v1 training if +# desired. +_C.SOLVER.BIAS_LR_FACTOR = 1.0 +_C.SOLVER.WEIGHT_DECAY_BIAS = None # None means following WEIGHT_DECAY + +# Gradient clipping +_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False}) +# Type of gradient clipping, currently 2 values are supported: +# - "value": the absolute values of elements of each gradients are clipped +# - "norm": the norm of the gradient for each parameter is clipped thus +# affecting all elements in the parameter +_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value" +# Maximum absolute value used for clipping gradients +_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0 +# Floating point number p for L-p norm to be used with the "norm" +# gradient clipping type; for L-inf, please specify .inf +_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0 + +# Enable automatic mixed precision for training +# Note that this does not change model's inference behavior. +# To use AMP in inference, run inference under autocast() +_C.SOLVER.AMP = CN({"ENABLED": False}) + +# ---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +# For end-to-end tests to verify the expected accuracy. +# Each item is [task, metric, value, tolerance] +# e.g.: [['bbox', 'AP', 38.5, 0.2]] +_C.TEST.EXPECTED_RESULTS = [] +# The period (in terms of steps) to evaluate the model during training. +# Set to 0 to disable. +_C.TEST.EVAL_PERIOD = 0 +# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval +# When empty, it will use the defaults in COCO. +# Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. +_C.TEST.KEYPOINT_OKS_SIGMAS = [] +# Maximum number of detections to return per image during inference (100 is +# based on the limit established for the COCO dataset). 
+_C.TEST.DETECTIONS_PER_IMAGE = 100 + +_C.TEST.AUG = CN({"ENABLED": False}) +_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200) +_C.TEST.AUG.MAX_SIZE = 4000 +_C.TEST.AUG.FLIP = True + +_C.TEST.PRECISE_BN = CN({"ENABLED": False}) +_C.TEST.PRECISE_BN.NUM_ITER = 200 + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +# Directory where output files are written +_C.OUTPUT_DIR = "./output" +# Set seed to negative to fully randomize everything. +# Set seed to positive to use a fixed seed. Note that a fixed seed increases +# reproducibility but does not guarantee fully deterministic behavior. +# Disabling all parallelism further increases reproducibility. +_C.SEED = -1 +# Benchmark different cudnn algorithms. +# If input images have very different sizes, this option will have large overhead +# for about 10k iterations. It usually hurts total time, but can benefit for certain models. +# If input images have the same or similar sizes, benchmark is often helpful. +_C.CUDNN_BENCHMARK = False +# The period (in terms of steps) for minibatch visualization at train time. +# Set to 0 to disable. +_C.VIS_PERIOD = 0 + +# global config is for quick hack purposes. +# You can set them in command line or config files, +# and access it with: +# +# from detectron2.config import global_cfg +# print(global_cfg.HACK) +# +# Do not commit any configs into it. +_C.GLOBAL = CN() +_C.GLOBAL.HACK = 1.0 + + +# def get_cfg_defaults(): +# return _C.clone() diff --git a/examples/02_detectron2/configs/faster_rcnn_R_101_FPN.yaml b/examples/02_detectron2/configs/faster_rcnn_R_101_FPN.yaml new file mode 100644 index 000000000..b69b96822 --- /dev/null +++ b/examples/02_detectron2/configs/faster_rcnn_R_101_FPN.yaml @@ -0,0 +1,47 @@ +MODEL: + NAME: "faster_rcnn_R_101_FPN" + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DEPTH: 101 + STAGES: [3, 4, 23, 3] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [0.5, 1.0, 2.0] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 1 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1344 +POSTPROCESS: + POST_ON: True + USE_TOPK: True + TOPK: 100 +VERSION: 2 diff --git a/examples/02_detectron2/configs/faster_rcnn_R_50_FPN.yaml b/examples/02_detectron2/configs/faster_rcnn_R_50_FPN.yaml new file mode 100644 index 000000000..26aa4c210 --- /dev/null +++ b/examples/02_detectron2/configs/faster_rcnn_R_50_FPN.yaml @@ -0,0 +1,45 @@ +MODEL: + NAME: "faster_rcnn_R_50_FPN" + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + FPN: + IN_FEATURES: ["res2", "res3", 
"res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [0.5, 1.0, 2.0] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 1 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1344 +POSTPROCESS: + POST_ON: True + USE_TOPK: True + TOPK: 100 +VERSION: 2 diff --git a/examples/02_detectron2/configs/mask_rcnn_R_101_FPN.yaml b/examples/02_detectron2/configs/mask_rcnn_R_101_FPN.yaml new file mode 100644 index 000000000..c2c6c946c --- /dev/null +++ b/examples/02_detectron2/configs/mask_rcnn_R_101_FPN.yaml @@ -0,0 +1,48 @@ +MODEL: + NAME: "mask_rcnn_R_101_FPN" + MASK_ON: True + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DEPTH: 101 + STAGES: [3, 4, 23, 3] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [0.5, 1.0, 2.0] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 1 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1344 +POSTPROCESS: + POST_ON: True + USE_TOPK: False + TOPK: 100 +VERSION: 2 diff --git a/examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml b/examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml new file mode 100644 index 000000000..47149bf18 --- /dev/null +++ b/examples/02_detectron2/configs/mask_rcnn_R_50_FPN.yaml @@ -0,0 +1,46 @@ +MODEL: + NAME: "mask_rcnn_R_50_FPN" + MASK_ON: True + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [0.5, 1.0, 2.0] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: 
("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 1 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1344 +POSTPROCESS: + POST_ON: True + USE_TOPK: False + TOPK: 100 +VERSION: 2 diff --git a/examples/02_detectron2/demo.py b/examples/02_detectron2/demo.py new file mode 100644 index 000000000..749a1eab8 --- /dev/null +++ b/examples/02_detectron2/demo.py @@ -0,0 +1,105 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +A main inference script for rcnn models +""" +import glob +import os + +import click +import tqdm +from configs import get_cfg_defaults +from predictor import Predictor + + +@click.command() +@click.option("--config", default="", metavar="FILE", help="path to config file") +@click.option("--bench-config", default="", metavar="FILE", help="path to config file") +@click.option( + "--input", + multiple=True, + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", +) +@click.option( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", +) +@click.option( + "--confidence-threshold", + type=float, + default=0.5, + help="Minimum score for instance predictions to be shown", +) +@click.option("--weight", default="", metavar="FILE", help="path to model weights") +@click.option("--batch", default=0, help="batch size") +@click.option("--display/--no-display", default=False, help="display results") +@click.option("--cudagraph/--no-cudagraph", default=False, help="enable CUDA graph") +def run_model( + config, + bench_config, + input, + output, + confidence_threshold, + weight, + batch, + display, + cudagraph, +): + cfg = get_cfg_defaults() + cfg.merge_from_file(config) + if bench_config != "": + cfg.merge_from_file(bench_config) + if batch > 0: + cfg.SOLVER.IMS_PER_BATCH = batch + cfg.MODEL.WEIGHTS = weight + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence_threshold + cfg.freeze() + + assert ( + weight != "" + ), "export model first: python convert_pt2ait.py model_d2.pkl params_ait.pkl \ + --config configs/faster_rcnn_R_50_DC5.yaml" + + demo = Predictor(cfg) + print("run {} end2end".format(cfg.MODEL.NAME)) + + cnt = 0 + duration = 0 + detections = {} + bs = cfg.SOLVER.IMS_PER_BATCH + if input: + if len(input) == 1: + input = glob.glob(os.path.expanduser(input[0])) + assert input, "The input path(s) was not found" + batch_data = demo.data_loader(input) + print("{} images, run {} batch".format(len(input), len(batch_data))) + for batch in tqdm.tqdm(batch_data, disable=not output): + results = demo.run_batch(batch, cudagraph) + detections.update(results) + if display: + demo.visualize(results) + duration += demo.benchmark(batch["data"], 10, cudagraph) + cnt += 1 + + duration /= cnt * bs + print( + f"AIT Detection: Batch size: {bs}, Time per iter: {duration:.2f} ms, FPS: {1000 / duration:.2f}" + ) + + +if __name__ == "__main__": + 
run_model() diff --git a/examples/02_detectron2/modeling/backbone/__init__.py b/examples/02_detectron2/modeling/backbone/__init__.py new file mode 100644 index 000000000..e2778377d --- /dev/null +++ b/examples/02_detectron2/modeling/backbone/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from .fpn import build_resnet_fpn_backbone, FPN +from .resnet import ( + BasicStem, + BottleneckBlock, + build_resnet_backbone, + make_stage, + ResNet, +) + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/examples/02_detectron2/modeling/backbone/fpn.py b/examples/02_detectron2/modeling/backbone/fpn.py new file mode 100644 index 000000000..fe14b3b98 --- /dev/null +++ b/examples/02_detectron2/modeling/backbone/fpn.py @@ -0,0 +1,228 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import math + +from aitemplate.compiler import ops +from aitemplate.frontend import nn + +from .resnet import build_resnet_backbone +from .utils import ShapeSpec + + +class FPN(nn.Module): + """ + This module implements :paper:`FPN`. + It creates pyramid features built on top of some input feature maps. + """ + + def __init__( + self, + bottom_up, + in_features, + out_channels, + norm="", + top_block=None, + fuse_type="sum", + square_pad=0, + ): + """ + Args: + bottom_up (Backbone): module representing the bottom up subnetwork. + Must be a subclass of :class:`Backbone`. The multi-scale feature + maps generated by the bottom up network, and listed in `in_features`, + are used to generate FPN levels. + in_features (list[str]): names of the input feature maps coming + from the backbone to which FPN is attached. For example, if the + backbone produces ["res2", "res3", "res4"], any *contiguous* sublist + of these may be used; order must be from high to low resolution. + out_channels (int): number of channels in the output feature maps. + norm (str): the normalization to use. + top_block (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list. The top_block + further downsamples the feature map. It must have an attribute + "num_levels", meaning the number of extra FPN levels added by + this block, and "in_feature", which is a string representing + its input feature (e.g., p5). 
+ fuse_type (str): types for fusing the top down features and the lateral + ones. It can be "sum" (default), which sums up element-wise; or "avg", + which takes the element-wise mean of the two. + square_pad (int): If > 0, require input images to be padded to specific square size. + """ + super().__init__() + assert in_features, in_features + + # Feature map strides and channels from the bottom up network (e.g. ResNet) + input_shapes = bottom_up.output_shape() + strides = [input_shapes[f].stride for f in in_features] + in_channels_per_feature = [input_shapes[f].channels for f in in_features] + + _assert_strides_are_log2_contiguous(strides) + lateral_convs = [] + output_convs = [] + + # use_bias = norm == "" + for idx, in_channels in enumerate(in_channels_per_feature): + lateral_conv = nn.Conv2dBias( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + ) + output_conv = nn.Conv2dBias( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + ) + stage = int(math.log2(strides[idx])) + self.add_module("fpn_lateral{}".format(stage), lateral_conv) + self.add_module("fpn_output{}".format(stage), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + self.top_block = top_block + self.in_features = tuple(in_features) + self.bottom_up = bottom_up + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = { + "p{}".format(int(math.log2(s))): s for s in strides + } + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = strides[-1] + self._square_pad = square_pad + assert fuse_type in {"avg", "sum"} + self._fuse_type = fuse_type + + def size_divisibility(self): + return self._size_divisibility + + def padding_constraints(self): + return {"square_size": self._square_pad} + + def forward(self, x): + """ + Args: + input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to + feature map tensor for each feature level in high to low resolution order. + Returns: + dict[str->Tensor]: + mapping from feature map name to FPN feature map tensor + in high to low resolution order. Returned feature names follow the FPN + paper convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
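+        Example (an illustrative sketch; it assumes ``cfg`` is a config node and
+        ``x`` an NHWC input tensor, and the exact keys depend on ``in_features``
+        and ``top_block``)::
+
+            fpn = build_resnet_fpn_backbone(cfg)
+            outs = fpn(x)
+            # outs maps e.g. "p2" ... "p6" to the corresponding FPN feature tensors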
+ """ + bottom_up_features = self.bottom_up(x) + + results = [] + prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]]) + results.append(self.output_convs[0](prev_features)) + + # Reverse feature maps into top-down order (from low to high resolution) + for idx, (lateral_conv, output_conv) in enumerate( + zip(self.lateral_convs, self.output_convs) + ): + # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336 + # Therefore we loop over all modules but skip the first one + if idx > 0: + features = self.in_features[-idx - 1] + features = bottom_up_features[features] + # top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest") + lateral_features = lateral_conv(features) + # prev_features = lateral_features + top_down_features + interpolate_op = ops.upsampling2d_add(scale_factor=2.0, mode="nearest") + prev_features = interpolate_op(prev_features, lateral_features) + if self._fuse_type == "avg": + prev_features /= 2 + results.insert(0, output_conv(prev_features)) + + if self.top_block is not None: + if self.top_block.in_feature in bottom_up_features: + top_block_in_feature = bottom_up_features[self.top_block.in_feature] + else: + top_block_in_feature = results[ + self._out_features.index(self.top_block.in_feature) + ] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return {f: res for f, res in zip(self._out_features, results)} + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) + for name in self._out_features + } + + +def _assert_strides_are_log2_contiguous(strides): + """ + Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". + """ + for i, stride in enumerate(strides[1:], 1): + assert ( + stride == 2 * strides[i - 1] + ), "Strides {} {} are not log2 contiguous".format(stride, strides[i - 1]) + + +class LastLevelMaxPool(nn.Module): + """ + This module is used in the original FPN to generate a downsampled + P6 feature from P5. + """ + + def __init__(self): + super().__init__() + self.num_levels = 1 + self.in_feature = "p5" + self.pool = nn.MaxPool2d(1, 2, 0) + + def forward(self, x): + return [self.pool(x)] + + +def build_resnet_fpn_backbone(cfg): + """ + Args: + cfg: a detectron2 CfgNode + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + bottom_up = build_resnet_backbone(cfg) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/examples/02_detectron2/modeling/backbone/resnet.py b/examples/02_detectron2/modeling/backbone/resnet.py new file mode 100644 index 000000000..5b3777a0d --- /dev/null +++ b/examples/02_detectron2/modeling/backbone/resnet.py @@ -0,0 +1,459 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import numpy as np +from aitemplate.frontend import nn +from aitemplate.testing import detect_target + +from .utils import ShapeSpec + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NHWC tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. + Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + +class BasicStem(CNNBlockBase): + """ + The standard ResNet stem (layers before the first residual block), + with a conv, relu and max_pool. + """ + + def __init__(self, in_channels=3, out_channels=64, norm="BN"): + super().__init__(in_channels, out_channels, 4) + conv_op = ( + nn.Conv2dBiasReluFewChannels + if detect_target().name() == "cuda" + else nn.Conv2dBiasRelu + ) + self.conv1 = conv_op(in_channels, out_channels, 7, 2, 7 // 2) + self.pool = nn.MaxPool2d(3, 2, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.pool(x) + return x + + +class BasicBlock(CNNBlockBase): + """ + The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, + with two 3x3 conv layers and a projection shortcut if needed. + """ + + def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): + super().__init__(in_channels, out_channels, stride) + + def forward(self, x): + raise NotImplementedError() + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. 
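+        Example (illustrative)::
+
+            block = BottleneckBlock(
+                in_channels=256,
+                out_channels=512,
+                bottleneck_channels=128,
+                stride=2,
+                stride_in_1x1=True,
+            )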
+ """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = nn.Conv2dBias(in_channels, out_channels, 1, stride, 0) + else: + self.shortcut = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = nn.Conv2dBiasRelu( + in_channels, bottleneck_channels, 1, stride_1x1, 0 + ) + + self.conv2 = nn.Conv2dBiasRelu( + bottleneck_channels, + bottleneck_channels, + 3, + stride_3x3, + 1 * dilation, + dilation, + ) + + self.conv3 = nn.Conv2dBiasAddRelu(bottleneck_channels, out_channels, 1, 1, 0) + + # for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + # if layer is not None: # shortcut can be None + # weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. + # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. + + def forward(self, x): + out = self.conv1(x) + out = self.conv2(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out = self.conv3(out, shortcut) + return out + + +class ResNet(nn.Module): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. + num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + """ + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + # Avoid keeping unused layers in this module. 
They consume extra memory + # and may cause allreduce to fail + num_stages = max( + [ + {"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) + for f in out_features + ] + ) + stages = stages[:num_stages] + + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + self.stage_names = tuple(self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AvgPool2d(7, 1, 0) + self.linear = nn.Linear(curr_channels, num_classes) + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format( + ", ".join(children) + ) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + # assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) + for name in self._out_features + } + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + Returns: + list[CNNBlockBase]: a list of block module. + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. 
+ """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert ( + newk not in kwargs + ), f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class( + in_channels=in_channels, out_channels=out_channels, **curr_kwargs + ) + ) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. + Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. + """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip( + num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels + ): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + + +def make_stage(*args, **kwargs): + """ + Deprecated alias for backward compatibiltiy. + """ + return ResNet.make_stage(*args, **kwargs) + + +def build_resnet_backbone(cfg): + """ + Create a ResNet instance from config. + Returns: + ResNet: a :class:`ResNet` instance. 
+ """ + norm = cfg.MODEL.RESNETS.NORM + stem = BasicStem( + in_channels=3, + out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, + norm=norm, + ) + + # fmt: off + freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT + out_features = cfg.MODEL.RESNETS.OUT_FEATURES + depth = cfg.MODEL.RESNETS.DEPTH + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION + # fmt: on + assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) + + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + + if depth in [18, 34]: + assert ( + out_channels == 64 + ), "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" + assert ( + res5_dilation == 1 + ), "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" + assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" + + stages = [] + + for idx, stage_idx in enumerate(range(2, 6)): + # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper + dilation = res5_dilation if stage_idx == 5 else 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), + "in_channels": in_channels, + "out_channels": out_channels, + "norm": norm, + } + # Use BasicBlock for R18 and R34. + if depth in [18, 34]: + stage_kargs["block_class"] = BasicBlock + else: + stage_kargs["bottleneck_channels"] = bottleneck_channels + stage_kargs["stride_in_1x1"] = stride_in_1x1 + stage_kargs["dilation"] = dilation + stage_kargs["num_groups"] = num_groups + stage_kargs["block_class"] = BottleneckBlock + blocks = ResNet.make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at) diff --git a/examples/02_detectron2/modeling/backbone/utils.py b/examples/02_detectron2/modeling/backbone/utils.py new file mode 100644 index 000000000..81a0cb203 --- /dev/null +++ b/examples/02_detectron2/modeling/backbone/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ShapeSpec: + """ + A simple structure that contains basic shape specification about a tensor. + It is often used as the auxiliary inputs/outputs of models, + to complement the lack of shape inference ability among pytorch modules. 
+ """ + + channels: Optional[int] = None + height: Optional[int] = None + width: Optional[int] = None + stride: Optional[int] = None diff --git a/examples/02_detectron2/modeling/meta_arch/__init__.py b/examples/02_detectron2/modeling/meta_arch/__init__.py new file mode 100644 index 000000000..0093ee3f1 --- /dev/null +++ b/examples/02_detectron2/modeling/meta_arch/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from .rcnn import GeneralizedRCNN + +__all__ = list(globals().keys()) diff --git a/examples/02_detectron2/modeling/meta_arch/rcnn.py b/examples/02_detectron2/modeling/meta_arch/rcnn.py new file mode 100644 index 000000000..1f60c0171 --- /dev/null +++ b/examples/02_detectron2/modeling/meta_arch/rcnn.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import torch +from aitemplate.frontend import nn +from aitemplate.frontend.nn.proposal import gen_batch_inds + +from ..backbone import build_resnet_fpn_backbone +from ..proposal_generator import build_rpn_head +from ..roi_heads import build_roi_heads + + +class GeneralizedRCNN(nn.Module): + def __init__(self, cfg): + super().__init__() + im_shape = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) + self._batch_size = cfg.SOLVER.IMS_PER_BATCH + self._mask_on = cfg.MODEL.MASK_ON + self._num_mask_roi = cfg.POSTPROCESS.TOPK + + self.backbone = build_resnet_fpn_backbone(cfg) + self.proposal_generator = build_rpn_head(cfg, im_shape) + self.roi_heads = build_roi_heads(cfg, im_shape) + self._params = self.get_params() + + def forward(self, x): + features = self.backbone(x) + rois, proposals = self.proposal_generator(features) + results = self.roi_heads(features, rois, proposals) + return results + + def set_anchors(self, mod): + self.proposal_generator.set_anchors(mod) + if self._mask_on: + batch_inds_mask = gen_batch_inds(self._batch_size, self._num_mask_roi) + weight = torch.from_numpy(batch_inds_mask).cuda().half() + mod.set_constant_with_tensor("batch_inds_mask", weight) + + def get_params(self): + params = self.proposal_generator.get_params() + if self._mask_on: + params["batch_inds_mask"] = gen_batch_inds( + self._batch_size, self._num_mask_roi + ) + return params diff --git a/examples/02_detectron2/modeling/proposal_generator/__init__.py b/examples/02_detectron2/modeling/proposal_generator/__init__.py new file mode 100644 index 000000000..07de3d901 --- /dev/null +++ b/examples/02_detectron2/modeling/proposal_generator/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from .rpn import build_rpn_head, RPN, StandardRPNHead + +__all__ = list(globals().keys()) diff --git a/examples/02_detectron2/modeling/proposal_generator/rpn.py b/examples/02_detectron2/modeling/proposal_generator/rpn.py new file mode 100644 index 000000000..ce7a0f2bc --- /dev/null +++ b/examples/02_detectron2/modeling/proposal_generator/rpn.py @@ -0,0 +1,177 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import numpy as np +import torch +from aitemplate.compiler import ops +from aitemplate.compiler.base import Tensor +from aitemplate.frontend import nn + + +class StandardRPNHead(nn.Module): + """ + Standard RPN classification and regression heads described in :paper:`Faster R-CNN`. + Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts + objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas + specifying how to deform each anchor into an object proposal. + """ + + def __init__( + self, + in_planes, + rpn_dim=256, + scales=((32,), (64,), (128,), (256,), (512,)), + ratios=(0.5, 1, 2), + ): + super().__init__() + num_anchors = len(scales) * len(ratios) + self.conv = nn.Conv2dBiasRelu(in_planes, rpn_dim, 3, 1, 1) + self.objectness_logits = nn.Conv2dBiasSigmoid(rpn_dim, num_anchors, 1, 1, 0) + self.anchor_deltas = nn.Conv2dBias(rpn_dim, num_anchors * 4, 1, 1, 0) + + def forward(self, features): + pred_objectness_logits = [] + pred_anchor_deltas = [] + for _, x in features.items(): + t = ops.conv2d_bias_relu(stride=1, pad=1)( + x, self.conv.weight.tensor(), self.conv.bias.tensor() + ) + logit = ops.conv2d_bias_sigmoid(stride=1, pad=0)( + t, + self.objectness_logits.weight.tensor(), + self.objectness_logits.bias.tensor(), + ) + reg = ops.conv2d_bias(stride=1, pad=0)( + t, self.anchor_deltas.weight.tensor(), self.anchor_deltas.bias.tensor() + ) + pred_objectness_logits.append(logit) + pred_anchor_deltas.append(reg) + + return pred_objectness_logits, pred_anchor_deltas + + +class RPN(nn.Module): + """ + Region Proposal Network, introduced by :paper:`Faster R-CNN`. + """ + + def __init__( + self, + cfg, + im_shape, + dtype="float16", + ): + super().__init__() + # fmt: off + in_planes = cfg.MODEL.FPN.OUT_CHANNELS + batch_size = cfg.SOLVER.IMS_PER_BATCH + rpn_pre_nms_top_n = cfg.MODEL.RPN.PRE_NMS_TOPK_TEST + rpn_post_nms_top_n = cfg.MODEL.RPN.POST_NMS_TOPK_TEST + self.iou_threshold = cfg.MODEL.RPN.NMS_THRESH + self.rpn_min_size = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE + self.scales = cfg.MODEL.ANCHOR_GENERATOR.SIZES + self.ratios = cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS + # fmt: on + self.rpn_pre_nms_top_n = rpn_pre_nms_top_n + self.rpn_post_nms_top_n = rpn_post_nms_top_n + self.topk = rpn_pre_nms_top_n + self.dtype = dtype + self.im_shape = im_shape + self.feat_strides = (4, 8, 16, 32, 64) + self.batch_size = batch_size + self.batch_inds = np.zeros((batch_size, rpn_post_nms_top_n, 1)).astype(dtype) + + self.rpn_head = StandardRPNHead( + in_planes, + in_planes, + scales=self.scales[0], + ratios=self.ratios, + ) + + self.proposal = nn.FPNProposal( + im_shape=im_shape, + feat_strides=self.feat_strides, + scales=self.scales, + ratios=self.ratios, + clip_box=True, + nms_on=False, + rpn_pre_nms_top_n=self.rpn_pre_nms_top_n, + rpn_post_nms_top_n=self.rpn_post_nms_top_n, + iou_threshold=self.iou_threshold, + rpn_min_size=self.rpn_min_size, + batch_size=batch_size, + ) + + def forward(self, features): + N = self.batch_size + pred_logits, pred_deltas = self.rpn_head(features) + pred_rois = self.proposal(pred_deltas) + + proposal_list = [] + score_list = [] + for rois, logit in zip(pred_rois, pred_logits): + rois = ops.reshape()(rois, [N, -1, 4]) + if self.topk > 0 and rois.shape()[1].value() > self.topk: + score_inds = ops.topk(k=self.topk)(ops.reshape()(logit, [N, -1])) + boxes_topk = ops.batch_gather()(rois, score_inds) + scores_topk = ops.batch_gather()( + ops.reshape()(logit, [N, -1, 1]), score_inds + ) + proposal_list.append(boxes_topk) + 
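+                # keep the scores gathered with the same top-k indices as the boxes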
score_list.append(ops.reshape()(scores_topk, [N, -1])) + else: + proposal_list.append(rois) + score_list.append(ops.reshape()(logit, [N, -1])) + + proposals_concat = ops.concatenate()(proposal_list, dim=1) + scores_concat = ops.concatenate()(score_list, dim=1) + scores_r = ops.reshape()(scores_concat, [N, -1]) + proposals_r = ops.reshape()(proposals_concat, [N, -1, 4]) + + dets = ops.nms( + self.rpn_pre_nms_top_n, + self.rpn_post_nms_top_n, + self.iou_threshold, + self.rpn_min_size, + )(proposals_r, scores_r) + + batch_inds = Tensor( + shape=[N, self.rpn_post_nms_top_n, 1], + dtype=self.dtype, + name="batch_inds", + value=0, + ) + ret = ops.reshape()(ops.concatenate()([batch_inds, dets], dim=2), [-1, 5]) + return ret, ops.reshape()(dets, [-1, 4]) + + def set_anchors(self, mod): + param = {"batch_inds": self.batch_inds.copy()} + for idx, _ in enumerate(self.feat_strides): + param["anchors_%d" % (idx + 2)] = self.proposal._anchors[idx].copy() + + weights = {name: torch.from_numpy(w).cuda().half() for name, w in param.items()} + for name, weight in weights.items(): + mod.set_constant_with_tensor(name, weight) + + def get_params(self): + params = { + "anchors_%d" % (idx + 2): anchor.copy() + for idx, anchor in enumerate(self.proposal._anchors) + } + params["batch_inds"] = self.batch_inds + return params + + +def build_rpn_head(cfg, input_shape): + return RPN(cfg, input_shape) diff --git a/examples/02_detectron2/modeling/roi_heads/__init__.py b/examples/02_detectron2/modeling/roi_heads/__init__.py new file mode 100644 index 000000000..f812e3c17 --- /dev/null +++ b/examples/02_detectron2/modeling/roi_heads/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from .box_head import build_box_head, FastRCNNConvFCHead +from .mask_head import MaskRCNNConvUpsampleHead +from .roi_heads import build_roi_heads, StandardROIHeads + +__all__ = list(globals().keys()) diff --git a/examples/02_detectron2/modeling/roi_heads/box_head.py b/examples/02_detectron2/modeling/roi_heads/box_head.py new file mode 100644 index 000000000..0269a6a4a --- /dev/null +++ b/examples/02_detectron2/modeling/roi_heads/box_head.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Tuple + +from aitemplate.compiler import ops +from aitemplate.frontend import nn + +from .fast_rcnn import FastRCNNOutputLayers + + +class FastRCNNConvFCHead(nn.Module): + """ + A head with a multi_level roi align layer and two fc layers. + """ + + def __init__( + self, + num_rois: int, + num_classes: int, + feat_dim: int, + fc_dim: int, + pooled_size: int, + im_shape: Tuple[int, int], + ): + super().__init__() + self.num_rois = num_rois + HH, WW = im_shape + self.roi_align = ops.multi_level_roi_align( + num_rois=num_rois, + pooled_size=pooled_size, + spatial_scale=1.0, + sampling_ratio=0, + position_sensitive=False, + continuous_coordinate=False, + im_shape=im_shape, + ) + in_channel = int(feat_dim * pooled_size**2) + mid_channel = fc_dim + + self.fc1 = nn.Linear(in_channel, mid_channel, specialization="relu") + self.fc2 = nn.Linear(mid_channel, mid_channel, specialization="relu") + + def forward(self, feat, rois): + roi_feat = self.roi_align(feat[0], feat[1], feat[2], feat[3], rois) + roi_feat = ops.reshape()(roi_feat, [ops.size()(roi_feat, 0), -1]) + fc1 = self.fc1(roi_feat) + fc2 = self.fc2(fc1) + return fc2 + + +def build_box_head(cfg, input_shape): + """ + Build a box head through `FastRCNNOutputLayers`. + """ + return FastRCNNOutputLayers(cfg, input_shape) diff --git a/examples/02_detectron2/modeling/roi_heads/fast_rcnn.py b/examples/02_detectron2/modeling/roi_heads/fast_rcnn.py new file mode 100644 index 000000000..d825a59a0 --- /dev/null +++ b/examples/02_detectron2/modeling/roi_heads/fast_rcnn.py @@ -0,0 +1,209 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Tuple + +from aitemplate.compiler import ops +from aitemplate.frontend import nn, Tensor + + +class fast_rcnn_inference: + def __init__( + self, + im_shape: Tuple[int, int], + num_rois: int, + num_classes: int, + clip_box: bool = True, + nms_on: bool = True, + use_topk: bool = True, + topk_per_image: int = 100, + iou_thresh: float = 0.5, + roi_align_on: bool = False, + batch_size: int = 1, + dtype: str = "float16", + ): + self.im_h, self.im_w = im_shape + self.num_rois = num_rois + self.num_classes = num_classes + self.dtype = dtype + self.clip_box = clip_box + self.topk_per_image = topk_per_image + self.iou_threshold = iou_thresh + self.nms_on = nms_on + self.use_topk = use_topk + self.roi_align_on = roi_align_on + self.batch_size = batch_size + self.class_agnostic_nms = False + + def __call__(self, boxes, scores, deltas): + """ + Args: + boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic + boxes for each image. Element i has shape (Ri, K * 4) if doing + class-specific regression, or (Ri, 4) if doing class-agnostic + regression, where Ri is the number of predicted objects for image i. + + scores (list[Tensor]): A list of Tensors of predicted class scores for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. 
Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. + + deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box + transform (see :class:`fast_rcnn_inference.box_transform`). + + Returns: + proposals. + """ + proposals = self.box_transform(boxes, deltas) + if self.nms_on: + return self.nms_wrapper(proposals, scores) + else: + return proposals + + def nms_wrapper(self, proposals, scores): + N = self.batch_size + proposals_p = ops.permute102()(proposals) + scores_x = ops.dynamic_slice()( + scores, start_indices=[0, 0], end_indices=[self.num_rois, self.num_classes] + ) + + OP = ops.efficient_nms( + self.num_rois // N, self.topk_per_image, self.iou_threshold, 0 + ) + args = ( + ops.reshape()(proposals_p, [N, -1, self.num_classes, 4]), + ops.reshape()(scores_x, [N, -1, self.num_classes]), + ) + detections = OP(*args) + if self.roi_align_on: + batch_inds = Tensor( + shape=[N, self.topk_per_image, 1], + dtype=self.dtype, + name="batch_inds_mask", + value=0, + ) + rois = ops.reshape()( + ops.concatenate()([batch_inds, detections[1]], dim=2), [-1, 5] + ) + return detections + (rois,) + else: + return detections + + def layout_transform(self, delta): + return ops.permute210()( + ops.reshape()(delta, [1, self.num_rois, self.num_classes]) + ) + + def apply_weight(self, deltas, weights=(0.1, 0.2)): + ww = weights[0] + wh = weights[1] + + deltas_r = ops.reshape()(deltas, [self.num_rois, -1, 4]) + (delta_x, delta_y, delta_w, delta_h) = ops.split()(deltas_r, 1, dim=2) + delta_xm = delta_x * ww + delta_ym = delta_y * ww + delta_wm = delta_w * wh + delta_hm = delta_h * wh + + return ( + self.layout_transform(delta_xm), + self.layout_transform(delta_ym), + self.layout_transform(delta_wm), + self.layout_transform(delta_hm), + ) + + def box_transform(self, boxes, deltas): + """ + The box-to-box transform defined in R-CNN. The transformation is parameterized by 4 deltas: (dx, dy, dw, dh). The transformation scales the box’s width and height by exp(dw), exp(dh) and shifts a box’s center by the offset (dx * width, dy * height). 
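+        Illustrative per-box update (the 0.1/0.2 delta weighting applied in
+        ``apply_weight`` is omitted here)::
+
+            ctr_x' = ctr_x + dx * width      w' = exp(dw) * width
+            ctr_y' = ctr_y + dy * height     h' = exp(dh) * height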
+ """ + const_0_5 = 0.5 + + (delta_x, delta_y, delta_w, delta_h) = self.apply_weight(deltas) + + boxes_r = ops.reshape()(boxes, [self.num_rois, 4]) + (anchor_x1, anchor_y1, anchor_x2, anchor_y2) = ops.split()(boxes_r, 1, dim=1) + widths = ops.reshape()(anchor_x2 - anchor_x1, [self.num_rois, 1]) + heights = ops.reshape()(anchor_y2 - anchor_y1, [self.num_rois, 1]) + + width_mid = widths * const_0_5 + height_mid = heights * const_0_5 + ctr_x = anchor_x1 + width_mid + ctr_y = anchor_y1 + height_mid + + pred_ctr_x = (delta_x * widths) + ctr_x + + pred_ctr_y = (delta_y * heights) + ctr_y + pred_w = ops.exp(delta_w) * widths + pred_h = ops.exp(delta_h) * heights + + p_x1 = pred_ctr_x - (const_0_5 * pred_w) + p_y1 = pred_ctr_y - (const_0_5 * pred_h) + p_x2 = pred_ctr_x + (const_0_5 * pred_w) + p_y2 = pred_ctr_y + (const_0_5 * pred_h) + + if self.clip_box: + f_x1, f_y1, f_x2, f_y2 = self.box_clip(p_x1, p_y1, p_x2, p_y2) + proposals = ops.concatenate()([f_x1, f_y1, f_x2, f_y2], dim=2) + else: + proposals = ops.concatenate()([p_x1, p_y1, p_x2, p_y2], dim=2) + + return proposals + + def box_clip(self, p_x1, p_y1, p_x2, p_y2): + x_min = 0 + x_max_h = self.im_h + x_max_w = self.im_w + + f_x1 = ops.hardtanh(p_x1, x_min, x_max_w) + f_y1 = ops.hardtanh(p_y1, x_min, x_max_h) + f_x2 = ops.hardtanh(p_x2, x_min, x_max_w) + f_y2 = ops.hardtanh(p_y2, x_min, x_max_h) + return f_x1, f_y1, f_x2, f_y2 + + +class FastRCNNOutputLayers(nn.Module): + """ + Two linear layers for predicting Fast R-CNN outputs: + + 1. proposal-to-detection box regression deltas + 2. classification scores + + , and a postprocess procedure. + """ + + def __init__(self, cfg, im_shape): + super().__init__() + in_channel = cfg.MODEL.ROI_BOX_HEAD.FC_DIM + num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + + self.cls_score = nn.Linear(in_channel, num_classes + 1) + self.bbox_pred = nn.Linear(in_channel, num_classes * 4) + + self.postprocess = fast_rcnn_inference( + im_shape=im_shape, + num_classes=num_classes, + num_rois=cfg.MODEL.RPN.POST_NMS_TOPK_TEST * cfg.SOLVER.IMS_PER_BATCH, + use_topk=cfg.POSTPROCESS.USE_TOPK, + roi_align_on=True if cfg.MODEL.MASK_ON else False, + topk_per_image=cfg.POSTPROCESS.TOPK, + iou_thresh=cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0], + clip_box=True, + nms_on=True, + batch_size=cfg.SOLVER.IMS_PER_BATCH, + ) + + def forward(self, x, proposals): + rcnn_logit = self.cls_score(x) + rcnn_logit = ops.softmax()(rcnn_logit, -1) + rcnn_reg = self.bbox_pred(x) + return self.postprocess(proposals, rcnn_logit, rcnn_reg) diff --git a/examples/02_detectron2/modeling/roi_heads/mask_head.py b/examples/02_detectron2/modeling/roi_heads/mask_head.py new file mode 100644 index 000000000..94e022205 --- /dev/null +++ b/examples/02_detectron2/modeling/roi_heads/mask_head.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from typing import Tuple + +from aitemplate.compiler import ops +from aitemplate.frontend import nn + + +class MaskRCNNConvUpsampleHead(nn.Module): + """ + A mask head with several conv layers, plus an upsample layer (or `ConvTranspose2d`). + Predictions are made with a final 1x1 conv layer. + """ + + def __init__( + self, + num_rois: int, + num_classes: int, + feat_dim: int, + conv_dim: int, + pooled_size: int, + im_shape: Tuple[int, int], + ): + super().__init__() + HH, WW = im_shape + self.roi_align = ops.multi_level_roi_align( + num_rois=num_rois, + pooled_size=pooled_size, + spatial_scale=1.0, + sampling_ratio=0, + position_sensitive=False, + continuous_coordinate=False, + im_shape=im_shape, + ) + in_channel = feat_dim + mid_channel = conv_dim + + self.mask_fcn1 = nn.Conv2dBiasRelu(in_channel, mid_channel, 3, 1, 1) + self.mask_fcn2 = nn.Conv2dBiasRelu(mid_channel, mid_channel, 3, 1, 1) + self.mask_fcn3 = nn.Conv2dBiasRelu(mid_channel, mid_channel, 3, 1, 1) + self.mask_fcn4 = nn.Conv2dBiasRelu(mid_channel, mid_channel, 3, 1, 1) + self.deconv = nn.ConvTranspose2dBiasRelu(mid_channel, mid_channel, 2, 2, 0) + self.predictor = nn.Conv2dBiasSigmoid(mid_channel, num_classes, 1, 1, 0) + + def forward(self, feat, rois): + roi_feat = self.roi_align(feat[0], feat[1], feat[2], feat[3], rois) + conv1 = self.mask_fcn1(roi_feat) + conv2 = self.mask_fcn2(conv1) + conv3 = self.mask_fcn3(conv2) + conv4 = self.mask_fcn4(conv3) + upsp = self.deconv(conv4) + mask = self.predictor(upsp) + return mask diff --git a/examples/02_detectron2/modeling/roi_heads/roi_heads.py b/examples/02_detectron2/modeling/roi_heads/roi_heads.py new file mode 100644 index 000000000..587d9601b --- /dev/null +++ b/examples/02_detectron2/modeling/roi_heads/roi_heads.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Dict + +from aitemplate.compiler import ops + +from aitemplate.frontend import nn, Tensor + +from .box_head import build_box_head, FastRCNNConvFCHead +from .mask_head import MaskRCNNConvUpsampleHead + + +class StandardROIHeads(nn.Module): + """ + The StandardROIHeads in a typical "C4" R-CNN model, where + the box and mask head share the cropping and + the per-region feature computation by a Res5 block. + See :paper:`ResNet` Appendix A. 
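+    Example (illustrative; this mirrors how ``GeneralizedRCNN`` uses it)::
+
+        roi_heads = build_roi_heads(cfg, im_shape)
+        detections = roi_heads(features, rois, proposals)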
+ """ + + def __init__(self, cfg, input_shape): + super().__init__() + self.mask_on = cfg.MODEL.MASK_ON + self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + self.box_predictor = build_box_head(cfg, input_shape) + + self.box_head = FastRCNNConvFCHead( + num_rois=cfg.MODEL.RPN.POST_NMS_TOPK_TEST, + num_classes=cfg.MODEL.ROI_HEADS.NUM_CLASSES, + feat_dim=cfg.MODEL.FPN.OUT_CHANNELS, + fc_dim=cfg.MODEL.ROI_BOX_HEAD.FC_DIM, + pooled_size=cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION, + im_shape=input_shape, + ) + if cfg.MODEL.MASK_ON: + self.mask_head = MaskRCNNConvUpsampleHead( + num_rois=cfg.POSTPROCESS.TOPK, + num_classes=cfg.MODEL.ROI_HEADS.NUM_CLASSES, + feat_dim=cfg.MODEL.FPN.OUT_CHANNELS, + conv_dim=cfg.MODEL.ROI_MASK_HEAD.CONV_DIM, + pooled_size=cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION, + im_shape=input_shape, + ) + + def get_shape(self, x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + def forward(self, features: Dict[str, Tensor], rois: Tensor, proposals: Tensor): + + box_features = [features[f] for f in self.in_features] + roi_feat = self.box_head(box_features, rois) + detections = self.box_predictor(roi_feat, proposals) + if self.mask_on: + num_dets, boxes, probs, class_pred, mask_rois = detections + pred_mask_logits = self.mask_head(box_features, mask_rois) + + num_rois, roi_size, _, num_classes = self.get_shape(pred_mask_logits) + batch_size = self.get_shape(boxes)[0] + batch_rois = num_rois // batch_size + + pred_mask_logits = ops.permute021()( + ops.reshape()(pred_mask_logits, [num_rois, -1, num_classes]) + ) + indices = ops.reshape()(class_pred, [num_rois, 1]) + mask_probs_pred = ops.batch_gather()(pred_mask_logits, indices) + mask_probs_pred = ops.reshape()( + mask_probs_pred, [batch_size, batch_rois, roi_size, roi_size] + ) + return num_dets, boxes, probs, class_pred, mask_probs_pred + else: + return detections + + +def build_roi_heads(cfg, input_shape): + """ + Build ROIHeads through `StandardROIHeads`. + """ + return StandardROIHeads(cfg, input_shape) diff --git a/examples/02_detectron2/predictor/__init__.py b/examples/02_detectron2/predictor/__init__.py new file mode 100644 index 000000000..96a749d96 --- /dev/null +++ b/examples/02_detectron2/predictor/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from .builtin_meta import _get_coco_instances_meta +from .predictor import Predictor + +__all__ = ["Predictor", "_get_coco_instances_meta"] diff --git a/examples/02_detectron2/predictor/builtin_meta.py b/examples/02_detectron2/predictor/builtin_meta.py new file mode 100644 index 000000000..c09e5a5ba --- /dev/null +++ b/examples/02_detectron2/predictor/builtin_meta.py @@ -0,0 +1,180 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Note: +For your custom dataset, there is no need to hard-code metadata anywhere in the code. +For example, for COCO-format dataset, metadata will be obtained automatically +when calling `load_coco_json`. For other dataset, metadata may also be obtained in other ways +during loading. +However, we hard-coded metadata for a few common dataset here. +The only goal is to allow users who don't have these dataset to use pre-trained models. +Users don't have to download a COCO json (which contains metadata), in order to visualize a +COCO model (with correct class names and colors). +""" + + +# All coco categories, together with their nice-looking visualization colors +# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 
0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, + {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, + {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, 
+ {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, + {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, + {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, + {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, + {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, + {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, + {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, + {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, + {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, + {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, + {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, + {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, + {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, + {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, + {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, + {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, + {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, + {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, + {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, + {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, + {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, + {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, + {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, + {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, + {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, + {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, + {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, + {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, + {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, + {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, + {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, + {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, + {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, + {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, + {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, + {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, + {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, + {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, + {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, + {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, + {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, + {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, + {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, + {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, + {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, + {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, + {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, + {"color": [0, 
114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, + {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, + {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"}, +] + + +def _get_coco_instances_meta(): + thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] + assert len(thing_ids) == 80, len(thing_ids) + # Mapping from the incontiguous COCO category id to an id in [0, 79] + thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} + thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] + ret = { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + "thing_colors": thing_colors, + } + return ret diff --git a/examples/02_detectron2/predictor/predictor.py b/examples/02_detectron2/predictor/predictor.py new file mode 100644 index 000000000..324a138c2 --- /dev/null +++ b/examples/02_detectron2/predictor/predictor.py @@ -0,0 +1,359 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import itertools +import os +from typing import Tuple + +import cv2 +import numpy as np +import torch + +from aitemplate.compiler import Model +from modeling.meta_arch import GeneralizedRCNN +from PIL import Image + +from .builtin_meta import _get_coco_instances_meta + + +class Predictor: + """ + Use this class to create AIT inference instances for detectron2 models. It includes procedures that is to 1) preprocess the input images, 2) load the weights of the AIT model, 3) run the AIT model and visualize the outputs, 4) benchmark the AIT model. + """ + + def __init__(self, cfg, workdir="./tmp"): + self.cfg = cfg + self.model_name = cfg.MODEL.NAME + self.batch_size = cfg.SOLVER.IMS_PER_BATCH + self.im_shape = (cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) + self.pixel_mean = cfg.MODEL.PIXEL_MEAN + self.pixel_std = cfg.MODEL.PIXEL_STD + self.mask_on = cfg.MODEL.MASK_ON + self.model = GeneralizedRCNN(cfg) + self.weights = self.get_parameters() + self.module = self.init_modules(cfg.MODEL.NAME, workdir) + self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + self.min_size = cfg.INPUT.MIN_SIZE_TEST + self.max_size = cfg.INPUT.MAX_SIZE_TEST + self.interp_method = Image.BILINEAR + + def get_parameters(self): + """ + Obtain the weights. + """ + parameters = { + name: w.contiguous().cuda().half() + for name, w in torch.load(self.cfg.MODEL.WEIGHTS).items() + } + for name, param in self.model._params.items(): + parameters[name] = torch.from_numpy(param).cuda().half() + return parameters + + def preprocess(self, im_path: str, pad_value: float = 0.0): + """ + Image preprocess: resize the image (see `apply_transform`), normalize the pixels, + and add padding. 
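+        The image is read with OpenCV and rotated 90 degrees if it is taller than it is wide.
+        Returns the padded float16 input of shape (1, MIN_SIZE_TEST, MAX_SIZE_TEST, 3), together
+        with the original image, its shape, and the resize scale used for later postprocessing.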
+ """ + # HH, WW = self.im_shape + ori_img = cv2.imread(im_path) + ori_shape = ori_img.shape + if ori_shape[0] > ori_shape[1]: + img = np.rot90(ori_img, k=1) + else: + img = ori_img + inputs = self.apply_transform(img) + resize_scale = img.shape[0] / inputs.shape[0] + pixel_mean = np.array(self.pixel_mean).reshape(1, 1, -1) + pixel_std = np.array(self.pixel_std).reshape(1, 1, -1) + inputs = (inputs - pixel_mean) / pixel_std + padding_size = ( + (0, self.min_size - inputs.shape[0]), + (0, self.max_size - inputs.shape[1]), + (0, 0), + ) + inputs = np.pad(inputs, padding_size, constant_values=pad_value) + inputs = inputs[np.newaxis, :] + return inputs.astype("float16"), ori_img, ori_shape, resize_scale + + def apply_transform(self, img): + """ + Resize the image while keeping the aspect ratio unchanged. + It attempts to scale the shorter edge to the given `short_edge_length`, + as long as the longer edge does not exceed `max_size`. + If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. + """ + h, w = img.shape[:2] + new_h, new_w = Predictor.get_output_shape(h, w, self.min_size, self.max_size) + if len(img.shape) > 2 and img.shape[2] == 1: + pil_image = Image.fromarray(img[:, :, 0], mode="L") + else: + pil_image = Image.fromarray(img) + pil_image = pil_image.resize((new_w, new_h), self.interp_method) + ret = np.asarray(pil_image) + if len(img.shape) > 2 and img.shape[2] == 1: + ret = np.expand_dims(ret, -1) + return ret + + def apply_bbox(self, bbox, im_w, im_h): + if im_h > im_w: + x0 = bbox[:, 0][..., np.newaxis] + y0 = bbox[:, 1][..., np.newaxis] + x1 = bbox[:, 2][..., np.newaxis] + y1 = bbox[:, 3][..., np.newaxis] + bbox = np.hstack((im_w - y1, x0, im_w - y0, x1)) + return bbox + + @staticmethod + def get_output_shape( + oldh: int, oldw: int, short_edge_length: int, max_size: int + ) -> Tuple[int, int]: + """ + Compute the output size given input size and target short edge length. + """ + h, w = oldh, oldw + size = short_edge_length * 1.0 + scale = size / min(h, w) + if h < w: + newh, neww = size, scale * w + else: + newh, neww = scale * h, size + if max(newh, neww) > max_size: + scale = max_size * 1.0 / max(newh, neww) + newh = newh * scale + neww = neww * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return (newh, neww) + + def data_loader(self, image_list): + """ + Load the images and convert them to batched data. + """ + batch_data = [] + HH, WW = self.im_shape + batch = np.zeros((self.batch_size, HH, WW, 3), dtype="float16") + img_paths, img_shapes, img_scales, raw_images = [], [], [], [] + num_samples = len(image_list) + max_iter = ( + (num_samples + self.batch_size - 1) // self.batch_size * self.batch_size + ) + datasets = itertools.cycle(image_list) + for idx in range(max_iter): + im_path = next(datasets) + input_data, raw_input, im_shape, im_scale = self.preprocess(im_path) + im_name = im_path.split("/")[-1] + img_paths.append(im_name) + img_shapes.append(im_shape) + img_scales.append(im_scale) + raw_images.append(raw_input) + batch[idx % self.batch_size, :, :, :] = input_data + if (idx + 1) % self.batch_size == 0: + batch_data.append( + { + "data": batch.astype("float16"), + "image_shape": img_shapes, + "image_scale": img_scales, + "path": img_paths, + "image": raw_images, + } + ) + img_paths, img_shapes, img_scales, raw_images = [], [], [], [] + return batch_data + + def init_modules(self, detection_model_name, workdir): + """ + Load the AIT module of the detection model, and set the weights. 
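+
+        The compiled module is loaded from `<workdir>/<model_name>/test.so`, and the exported
+        weights are bound to the module with `Model.set_constant_with_tensor`.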
+ """ + mod = Model(os.path.join(workdir, detection_model_name, "test.so")) + for name, weight in self.weights.items(): + mod.set_constant_with_tensor(name, weight) + + return mod + + def run_batch(self, batch_data, graph_mode=False): + """ + Run the inference of the AIT model with batched data. + """ + score_thresh = self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST + results = {} + inputs = batch_data["data"] + image_list = batch_data["path"] + image_shapes = batch_data["image_shape"] + image_scales = batch_data["image_scale"] + images = batch_data["image"] + ret = self.run_on_image(inputs, graph_mode=graph_mode) + batched_boxes, batched_scores, batched_classes = ret[:3] + if self.mask_on: + batched_masks = ret[-1] + for i in range(self.batch_size): + boxes, scores, classes = ( + batched_boxes[i, :], + batched_scores[i, :], + batched_classes[i, :], + ) + + filter_mask = scores > score_thresh + filter_inds = filter_mask.nonzero()[0] + scores = scores[filter_inds] + boxes = boxes[filter_inds, :] * image_scales[i] + boxes = self.apply_bbox(boxes, image_shapes[i][1], image_shapes[i][0]) + classes = classes[filter_inds] + + results[image_list[i]] = { + "boxes": boxes, + "scores": scores, + "classes": classes, + "image_height": image_shapes[i][0], + "image_width": image_shapes[i][1], + "num_instances": boxes.shape[0], + "image": images[i], + } + if self.mask_on: + mask_pred = batched_masks[i, filter_inds, :, :] + results[image_list[i]]["masks"] = mask_pred + return results + + @staticmethod + def overlay(image, mask, color, alpha_transparency=0.5): + for channel in range(3): + image[:, :, channel] = np.where( + mask == 1, + image[:, :, channel] * (1 - alpha_transparency) + + alpha_transparency * color[channel] * 255, + image[:, :, channel], + ) + return image + + def visualize( + self, detections, output_path="./tmp/outputs", thickness=1, mask_thresh=0.5 + ): + """ + Visualize the outputs. 
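+
+        Draws the predicted boxes, class names, and scores on each image, overlays the predicted
+        masks when MASK_ON is set, and writes the annotated images to `output_path`.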
+ """ + os.makedirs(output_path, exist_ok=True) + meta_data = _get_coco_instances_meta() + thing_colors = meta_data["thing_colors"] + thing_classes = meta_data["thing_classes"] + for file_name, result in detections.items(): + img = result["image"] + boxes = result["boxes"] + classes = result["classes"] + scores = result["scores"] + for pred_box, pred_class, pred_score in zip(boxes, classes, scores): + box = pred_box.astype("int") + start_point = (box[0], box[1]) + end_point = (box[2], box[3]) + color = tuple(thing_colors[pred_class]) + img = cv2.rectangle(img, start_point, end_point, color, thickness) + text = thing_classes[pred_class] + ": " + str(pred_score) + img = cv2.putText( + img, + text, + (box[0], box[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + color, + thickness, + ) + + if self.mask_on: + masks = result["masks"] + im_height, im_width = img.shape[:2] + for pred_box, pred_class, mask in zip(boxes, classes, masks): + np_color = np.array(thing_colors[pred_class]) / 255 + if im_height > im_width: + mask = np.rot90(mask, k=-1) + box = pred_box.astype("int") + det_width = box[2] - box[0] + det_height = box[3] - box[1] + mask = mask.astype(np.float32) + small_mask = Image.fromarray(mask) + mask = small_mask.resize( + (det_width, det_height), resample=self.interp_method + ) + mask = np.array(mask, copy=False) + mask = np.array(mask > mask_thresh, dtype=np.uint8) + padded_mask = np.zeros((im_height, im_width), dtype=np.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2], im_width) + y_0 = max(box[1], 0) + y_1 = min(box[3], im_height) + padded_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + img = Predictor.overlay(img, padded_mask, np_color) + cv2.imwrite(os.path.join(output_path, file_name), img) + + def run_on_image(self, inputs, graph_mode=False): + """ + Call the AIT module for the inference of the model on given inputs, and return the outputs. + """ + topk = self.cfg.POSTPROCESS.TOPK + mod = self.module + if type(inputs) is np.ndarray: + arr = torch.from_numpy(inputs).cuda() + else: + arr = inputs.contiguous() + + inputs = [arr] + + outputs = [ + torch.empty([self.batch_size, 1], dtype=torch.int64).cuda(), + torch.empty([self.batch_size, topk, 4]).cuda().half(), + torch.empty([self.batch_size, topk]).cuda().half(), + torch.empty([self.batch_size, topk], dtype=torch.int64).cuda(), + ] + if self.mask_on: + mask_size = self.cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION * 2 + mask_blob = torch.empty([self.batch_size, topk, mask_size, mask_size]) + outputs.append(mask_blob.cuda().half()) + mod.run_with_tensors(inputs, outputs, graph_mode=graph_mode) + + ret = [ + outputs[1].cpu().numpy(), + outputs[2].cpu().numpy(), + outputs[3].cpu().numpy(), + ] + if self.mask_on: + ret.append(outputs[-1].cpu().numpy()) + return ret + + def benchmark(self, inputs, count=10, graph_mode=False): + """ + Benchmark the inference of the AIT model on given inputs, and return the runtime in ms. 
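+
+        The latency comes from `Model.benchmark_with_tensors` with the given iteration `count`
+        and `repeat=2`; the `graph_mode` flag is forwarded to the runtime.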
+ """ + mod = self.module + if type(inputs) is np.ndarray: + arr = torch.from_numpy(inputs).cuda() + else: + arr = inputs.cuda().contiguous() + topk = self.cfg.POSTPROCESS.TOPK + outputs = [ + torch.empty([self.batch_size, 1], dtype=torch.int64).cuda(), + torch.empty([self.batch_size, topk, 4]).cuda().half(), + torch.empty([self.batch_size, topk]).cuda().half(), + torch.empty([self.batch_size, topk], dtype=torch.int64).cuda(), + ] + if self.mask_on: + mask_blob = torch.empty([self.batch_size, topk, 28, 28]) + outputs.append(mask_blob.cuda().half()) + + duration, _, _ = mod.benchmark_with_tensors( + [arr], + outputs, + count=count, + repeat=2, + graph_mode=graph_mode, + ) + return duration diff --git a/examples/02_detectron2/prepare_and_run_rcnn.sh b/examples/02_detectron2/prepare_and_run_rcnn.sh new file mode 100755 index 000000000..f1edabe26 --- /dev/null +++ b/examples/02_detectron2/prepare_and_run_rcnn.sh @@ -0,0 +1,59 @@ +#!/bin/bash -e +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +### Download COCO 2017 Dataset + +#### Download image annotations +BASE=https://dl.fbaipublicfiles.com/detectron2 +ROOT=~/.torch/datasets +mkdir -p $ROOT/coco/annotations +echo "$ROOT" + +for anno in instances_val2017_100 \ + person_keypoints_val2017_100 ; do + + dest=$ROOT/coco/annotations/$anno.json + [[ -s $dest ]] && { + echo "$dest exists. Skipping ..." + } || { + wget $BASE/annotations/coco/$anno.json -O $dest + } +done + +#### Download images +dest=$ROOT/coco/val2017_100.tgz +[[ -d $ROOT/coco/val2017 ]] && { + echo "$ROOT/coco/val2017 exists. Skipping ..." +} || { + wget $BASE/annotations/coco/val2017_100.tgz -O $dest + tar xzf $dest -C $ROOT/coco/ && rm -f $dest +} +IMG_PATH=$ROOT/coco/val2017 + +### Download Pre-trained Model + +MODEL_PATH=~/.torch/model +mkdir -p $MODEL_PATH +MODEL_NAME=mask_rcnn_R_50_FPN + +wget $BASE/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl -O tmp/pt_$MODEL_NAME.pkl + +### Build AIT Model, Export the Pre-trained Weights and Run Inference + +cfg=examples/02_detectron2/configs/$MODEL_NAME.yaml +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 examples/02_detectron2/compile_model.py \ + --config $cfg \ + --batch 1 + +python3 examples/02_detectron2/tools/convert_pt2ait.py \ + --d2-weight tmp/pt_$MODEL_NAME.pkl \ + --ait-weight tmp/ait_$MODEL_NAME.pt \ + --model-name $MODEL_NAME + +python3 examples/02_detectron2/demo.py \ + --weight tmp/ait_$MODEL_NAME.pt \ + --config $cfg \ + --batch 1 --input "$IMG_PATH/*.jpg" \ + --confidence-threshold 0.5 \ + --display \ + --cudagraph diff --git a/examples/02_detectron2/tools/convert_pt2ait.py b/examples/02_detectron2/tools/convert_pt2ait.py new file mode 100644 index 000000000..584e14560 --- /dev/null +++ b/examples/02_detectron2/tools/convert_pt2ait.py @@ -0,0 +1,157 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +script for converting model from detectron2 to aitemplate +""" + +import json +import os +import pickle as pkl + +import click + +import numpy as np +import torch +from aitemplate.testing import detect_target + +# pylint: disable=C0103 + + +class detectron2_export: + def __init__(self, model_name): + self.model_name = model_name + + def export_model(self, model): + fuse_model = {} + bn_keys = set() + for k, _ in model.items(): + if "norm" in k: + param_name = k.split(".norm")[0] + if param_name in bn_keys: + continue + bn_keys.add(param_name) + self.transform_params(param_name, model, fuse_model, fuse_bn=True) + else: + self.transform_params(k, model, fuse_model, fuse_bn=False) + + ait_model = { + k.replace(".", "_"): weight + for k, weight in fuse_model.items() + if "anchors" not in k + } + + if detect_target().name() == "cuda": + self.export_conv0(ait_model, fuse_model) + + self.check_model(ait_model) + return ait_model + + def check_model(self, ait_model): + with open(os.path.join("./tmp", self.model_name, "params.json")) as fi: + param_map = json.load(fi) + for name, shape in param_map: + assert ait_model[name].shape == tuple( + shape + ), "weight shape mismatch {} {} expected {}".format( + name, ait_model[name].shape, shape + ) + + def fuse_conv_bn_weights( + self, conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b, transpose=False + ): + conv_w = torch.tensor(conv_w) + bn_rm = torch.tensor(bn_rm) + bn_rv = torch.tensor(bn_rv) + bn_w = torch.tensor(bn_w) + bn_b = torch.tensor(bn_b) + bn_eps = torch.tensor(bn_eps) + + if conv_b is None: + conv_b = torch.zeros_like(bn_rm) + if bn_w is None: + bn_w = torch.ones_like(bn_rm) + if bn_b is None: + bn_b = torch.zeros_like(bn_rm) + bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) + + if transpose: + shape = [1, -1] + [1] * (len(conv_w.shape) - 2) + else: + shape = [-1, 1] + [1] * (len(conv_w.shape) - 2) + + conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape(shape) + conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b + + for arr in [conv_w.numpy(), conv_b.numpy()]: + if np.isnan(arr).any(): + print("fuse bn error") + return conv_w, conv_b + + def transform_params(self, param_name, obj, fuse_model, fuse_bn=True): + if not fuse_bn: + arr = obj[param_name] + if len(arr.shape) == 4: + arr = np.transpose(arr, (0, 2, 3, 1)) + elif "fc1.weight" in param_name: + arr = arr.reshape((1024, -1, 7, 7)) + arr = np.transpose(arr, (0, 2, 3, 1)) + arr = arr.reshape((1024, -1)) + fuse_model[param_name] = torch.tensor(arr) + + else: + conv_k = "%s.weight" % (param_name) + conv_b = "%s.bias" % (param_name) + bn_w_k = "%s.norm.weight" % (param_name) + bn_b_k = "%s.norm.bias" % (param_name) + bn_rm_k = "%s.norm.running_mean" % (param_name) + bn_rv_k = "%s.norm.running_var" % (param_name) + fused_conv_weight, fused_conv_bias = self.fuse_conv_bn_weights( + obj[conv_k], + None, + obj[bn_rm_k], + obj[bn_rv_k], + 1e-5, + obj[bn_w_k], + obj[bn_b_k], + ) + fuse_model[conv_k] = fused_conv_weight.permute((0, 2, 3, 1)) + fuse_model[conv_b] = fused_conv_bias + + def export_conv0(self, ait_model, fuse_model): + pt_name = "backbone.bottom_up.stem.conv1.weight" + x = fuse_model[pt_name] + conv_w = torch.zeros((64, 7, 7, 4)) + conv_w[:, :, :, :3] = x + ait_model[pt_name.replace(".", "_")] = conv_w + + +@click.command() +@click.option("--model-name", default="", metavar="FILE", help="path to ait param file") +@click.option("--d2-weight", default="", metavar="FILE", help="D2 weight") +@click.option("--ait-weight", default="", metavar="FILE", help="AIT weight") +def 
export_pt_model_to_ait(model_name, d2_weight, ait_weight):
+    d2ait = detectron2_export(model_name)
+    with open(d2_weight, "rb") as f:
+        file = f.read()
+        obj = pkl.loads(file, encoding="latin1")
+        pt_model = obj["model"]
+
+    ait_model = d2ait.export_model(pt_model)
+
+    torch.save(ait_model, ait_weight)
+
+
+if __name__ == "__main__":
+    export_pt_model_to_ait()
diff --git a/examples/03_bert/README.md b/examples/03_bert/README.md
new file mode 100644
index 000000000..2c6e4a489
--- /dev/null
+++ b/examples/03_bert/README.md
@@ -0,0 +1,303 @@
+# BERT
+
+This directory contains an AIT demo for the [BERT language representation model](https://huggingface.co/docs/transformers/v4.22.1/en/model_doc/bert).
+
+Only `bert-base-uncased` is included.
+
+## Prerequisites
+
+Install the dependencies:
+```
+python3 -m pip install transformers click torch
+```
+
+## Benchmarking
+
+To run a basic benchmark, use `benchmark_ait.py`:
+
+```
+python3 examples/03_bert/benchmark_ait.py
+```
+
+There are two options for the hidden activation, `gelu` and `fast_gelu` (`fast_gelu` by default).
+`gelu` is not supported on AMD hardware yet.
+
+```
+python3 examples/03_bert/benchmark_ait.py --activation gelu
+python3 examples/03_bert/benchmark_ait.py --activation fast_gelu
+```
+
+The batch size and sequence length can also be configured on the command line:
+```
+python3 examples/03_bert/benchmark_ait.py --batch-size 1 --seq-length 128
+```
+
+PyTorch eager mode benchmarks can also be run:
+```
+python3 examples/03_bert/benchmark_pt.py
+```
+
+To include the BERT embeddings in the benchmark (rather than the encoders alone), pass `--encoders-only False`.
+
+## Quick Demo
+
+To run a quick demo with a simple prompt, use `demo.py`:
+```
+python3 examples/03_bert/demo.py --prompt "The quick brown fox jumps over the lazy dog."
+```
+
+The demo prints the resulting logits. Only sequence lengths up to 512 are supported.
+
+## Multi-GPU profiling
+AIT needs to run profiling to decide the best CUTLASS and CK algorithms.
+To profile with multiple GPUs, set the environment variable `CUDA_VISIBLE_DEVICES` on the NVIDIA platform or `HIP_VISIBLE_DEVICES` on the AMD platform.
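+
+For example (the device ids below are only an illustration; list whichever devices you want the profiler to use):
+
+```
+# NVIDIA
+CUDA_VISIBLE_DEVICES=0,1,2,3 python3 examples/03_bert/benchmark_ait.py
+# AMD
+HIP_VISIBLE_DEVICES=0,1 python3 examples/03_bert/benchmark_ait.py
+```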
+ +## Reference Speed vs PyTorch Eager +_PT = PyTorch 1.12 Eager_ +_OOM = Out of Memory_ + +### A100-40GB / CUDA 11.6.2 + +- Sequence length 64 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 7.96 | 125.65 | 0.71 | 1399.64 | +| 2 | 8.38 | 238.59 | 0.74 | 2719.15 | +| 4 | 8.29 | 482.30 | 0.80 | 4994.37 | +| 8 | 8.51 | 939.97 | 0.95 | 8439.67 | +| 16 | 8.09 | 1978.47 | 1.41 | 11385.85 | +| 32 | 9.19 | 3481.34 | 2.23 | 14357.58 | +| 64 | 9.12 | 7016.80 | 4.14 | 15458.15 | +| 128 | 14.52 | 8814.57 | 8.00 | 15991.44 | +| 256 | 27.75 | 9224.39 | 15.99 | 16006.79 | + + +- Sequence length 128 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 8.02 | 124.72 | 0.78 | 1281.52 | +| 2 | 8.29 | 241.22 | 0.85 | 2364.94 | +| 4 | 8.51 | 470.29 | 0.99 | 4044.33 | +| 8 | 8.12 | 985.72 | 1.43 | 5600.93 | +| 16 | 9.22 | 1735.20 | 2.21 | 7232.47 | +| 32 | 9.11 | 3512.80 | 4.17 | 7677.82 | +| 64 | 15.29 | 4184.93 | 8.05 | 7949.06 | +| 128 | 29.44 | 4347.33 | 16.03 | 7987.11 | +| 256 | 56.34 | 4543.88 | 31.57 | 8109.08 | + + +- Sequence length 384 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 8.72 | 114.73 | 1.63 | 611.91 | +| 2 | 8.31 | 240.73 | 1.97 | 1013.19 | +| 4 | 8.64 | 463.10 | 2.55 | 1569.23 | +| 8 | 9.32 | 858.70 | 3.95 | 2025.62 | +| 16 | 13.90 | 1151.03 | 6.80 | 2354.21 | +| 32 | 26.72 | 1197.74 | 13.30 | 2405.46 | +| 64 | 51.02 | 1254.34 | 26.68 | 2398.95 | +| 128 | 100.26 | 1276.67 | 51.60 | 2480.67 | +| 256 | OOM | OOM | 101.55 | 2520.81 | + + +- Sequence length 1024 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 9.74 | 102.65 | 2.20 | 454.12 | +| 2 | 11.38 | 175.75 | 4.15 | 481.95 | +| 4 | 13.61 | 293.90 | 8.36 | 478.44 | +| 8 | 25.79 | 310.15 | 12.53 | 638.53 | +| 16 | 49.91 | 320.59 | 21.61 | 740.48 | +| 32 | 97.00 | 329.91 | 42.84 | 746.88 | +| 64 | 191.14 | 334.83 | 83.95 | 762.39 | +| 128 | OOM | OOM | 163.96 | 780.70 | +| 256 | OOM | OOM | 324.22 | 789.58 | + + + +- Sequence length 4096 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 32.82 | 30.47 | 18.23 | 54.87 | +| 2 | 65.25 | 30.65 | 35.64 | 56.11 | +| 4 | 128.73 | 31.07 | 103.67 | 38.58 | +| 8 | OOM | OOM | 119.45 | 66.98 | +| 16 | OOM | OOM | 166.25 | 96.24 | +| 32 | OOM | OOM | 333.98 | 95.81 | +| 64 | OOM | OOM | 662.29 | 96.63 | +| 128 | OOM | OOM | 1313.77 | 97.43 | +| 256 | | | | | + + + +### MI-250 / ROCm 5.2.3 / HIPCC-10736 + +#### 1 GCD + +- Sequence length 64 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 5.72 | 174.72 | 2.78 | 359.88 | +| 2 | 5.96 | 335.38 | 2.87 | 697.76 | +| 4 | 5.85 | 684.16 | 2.85 | 1404.31 | +| 8 | 6.15 | 1300.72 | 3.15 | 2540.72 | +| 16 | 6.14 | 2605.40 | 3.78 | 4231.12 | +| 32 | 7.73 | 4138.06 | 5.34 | 5993.50 | +| 64 | 14.38 | 4451.07 | 9.10 | 7030.42 | +| 128 | 26.18 | 4889.95 | 16.45 | 
7780.40 | +| 256 | 49.95 | 5125.04 | 31.90 | 8023.98 | + + +- Sequence length 128 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 5.76 | 173.55 | 2.68 | 373.03 | +| 2 | 6.06 | 330.18 | 2.87 | 697.33 | +| 4 | 5.96 | 670.65 | 3.02 | 1324.91 | +| 8 | 6.03 | 1326.23 | 3.65 | 2194.62 | +| 16 | 9.35 | 1711.55 | 4.98 | 3212.12 | +| 32 | 16.46 | 1943.61 | 8.48 | 3775.22 | +| 64 | 30.83 | 2075.74 | 15.44 | 4146.40 | +| 128 | 58.74 | 2179.24 | 30.57 | 4187.68 | +| 256 | 115.27 | 2220.87 | 59.28 | 4318.61 | + + +- Sequence length 384 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 5.78 | 172.87 | 2.97 | 336.14 | +| 2 | 6.02 | 332.30 | 3.45 | 579.89 | +| 4 | 8.00 | 499.85 | 4.68 | 854.16 | +| 8 | 13.79 | 580.01 | 7.47 | 1070.24 | +| 16 | 24.39 | 656.06 | 13.04 | 1226.77 | +| 32 | 45.56 | 702.33 | 24.26 | 1318.80 | +| 64 | 87.84 | 728.57 | 47.87 | 1336.92 | +| 128 | 172.57 | 741.71 | 95.22 | 1344.26 | +| 256 | 352.27 | 726.71 | 185.94 | 1376.78 | + + + +- Sequence length 1024 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 6.86 | 145.71 | 4.20 | 237.84 | +| 2 | 12.41 | 161.21 | 5.82 | 343.62 | +| 4 | 22.25 | 179.80 | 10.20 | 392.26 | +| 8 | 41.94 | 190.73 | 18.91 | 423.05 | +| 16 | 81.03 | 197.45 | 37.86 | 422.60 | +| 32 | 159.06 | 201.19 | 71.65 | 446.62 | +| 64 | 321.51 | 199.06 | 148.86 | 429.95 | +| 128 | OOM | OOM | 277.53 | 461.21 | +| 256 | OOM | OOM | 563.07 | 454.65 | + + +- Sequence length 4096 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | 49.89 | 20.04 | 16.18 | 61.81 | +| 2 | 93.22 | 21.45 | 30.67 | 65.21 | +| 4 | 183.57 | 21.79 | 66.78 | 59.90 | +| 8 | 366.57 | 21.82 | 117.49 | 68.09 | +| 16 | OOM | OOM | 231.15 | 69.22 | +| 32 | OOM | OOM | 459.46 | 69.65 | +| 64 | OOM | OOM | 1031.86 | 62.02 | +| 128 | | | | | +| 256 | | | | | + + +#### 2 GCDs + +- Sequence length 64 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | | | | | +| 2 | 5.52 | 362.55 | 2.80 | 714.99 | +| 4 | 6.04 | 661.73 | 2.89 | 1385.05 | +| 8 | 6.07 | 1317.20 | 2.82 | 2835.38 | +| 16 | 6.02 | 2659.82 | 3.29 | 4866.99 | +| 32 | 6.09 | 5257.45 | 3.83 | 8352.10 | +| 64 | 8.53 | 7506.95 | 5.81 | 11013.02 | +| 128 | 15.34 | 8346.14 | 10.00 | 12806.23 | +| 256 | 28.44 | 9002.30 | 18.92 | 13528.13 | + + +- Sequence length 128 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) | +|------------|-----------------|----------------|------------------|-----------------| +| 1 | | | | | +| 2 | 5.58 | 358.62 | 2.68 | 745.20 | +| 4 | 6.20 | 644.91 | 2.83 | 1411.55 | +| 8 | 6.08 | 1316.09 | 3.21 | 2492.88 | +| 16 | 5.89 | 2716.79 | 3.86 | 4144.50 | +| 32 | 9.86 | 3247.03 | 5.41 | 5915.33 | +| 64 | 17.71 | 3614.25 | 9.64 | 6640.53 | +| 128 | 32.74 | 3909.15 | 17.81 | 7186.25 | +| 256 | 62.73 | 4080.77 | 35.73 | 7165.20 | + + +- Sequence length 384 + +| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) 
|
+|------------|-----------------|----------------|------------------|-----------------|
+| 1 | | | | |
+| 2 | 5.57 | 358.88 | 3.09 | 647.71 |
+| 4 | 6.12 | 653.83 | 3.62 | 1104.69 |
+| 8 | 8.35 | 958.19 | 4.94 | 1620.06 |
+| 16 | 14.29 | 1119.38 | 8.29 | 1930.01 |
+| 32 | 26.10 | 1226.17 | 14.96 | 2139.07 |
+| 64 | 50.01 | 1279.72 | 28.22 | 2268.02 |
+| 128 | 97.55 | 1312.15 | 55.94 | 2288.37 |
+| 256 | 193.00 | 1326.44 | 111.27 | 2300.68 |
+
+
+
+- Sequence length 1024
+
+| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) |
+|------------|-----------------|----------------|------------------|-----------------|
+| 1 | | | | |
+| 2 | 6.80 | 294.16 | 4.36 | 458.93 |
+| 4 | 13.01 | 307.55 | 6.43 | 622.23 |
+| 8 | 23.39 | 341.99 | 11.52 | 694.52 |
+| 16 | 44.45 | 359.94 | 21.83 | 732.90 |
+| 32 | 87.23 | 366.84 | 43.73 | 731.77 |
+| 64 | 172.92 | 370.12 | 82.92 | 771.85 |
+| 128 | 352.09 | 363.54 | 173.14 | 739.29 |
+| 256 | OOM | OOM | 322.97 | 792.64 |
+
+
+- Sequence length 4096
+
+| Batch size | PT Latency (ms) | PT QPS (seq/s) | AIT Latency (ms) | AIT QPS (seq/s) |
+|------------|-----------------|----------------|------------------|-----------------|
+| 1 | | | | |
+| 2 | 54.67 | 36.58 | 18.31 | 109.23 |
+| 4 | 104.19 | 38.39 | 35.09 | 113.99 |
+| 8 | 206.62 | 38.72 | 77.03 | 103.86 |
+| 16 | 412.58 | 38.78 | 133.59 | 119.77 |
+| 32 | OOM | OOM | 263.40 | 121.49 |
+| 64 | OOM | OOM | 524.11 | 122.11 |
+| 128 | OOM | OOM | 1186.20 | 107.91 |
+| 256 | | | | |
+
+
+### Notes on Performance Results
+
+- For NVIDIA A100, our test cluster does not allow locking the GPU frequency. We use a longer warm-up to collect more stable results, but a small variance relative to locked-frequency results is expected.
+- To benchmark MI-250, first run `python3 benchmark_ait.py` to generate all the necessary model dynamic library files on a single GCD. Then run `./benchmark_mi250.sh {batch_size}` to simulate data-parallel execution on 2 GCDs, with each GCD processing half of the batch.
+- To benchmark MI-250 with 1 GCD, we lock the frequency with the command `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
+- To benchmark MI-250 with 2 GCDs, we observed a performance regression with the ROCm perf-determinism mode, so the 2-GCD numbers are collected with that mode disabled via `rocm-smi -d x --resetperfdeterminism`, where `x` is the GPU id.
+- The PyTorch Eager results do not reflect [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/), mainly because the BetterTransformer integration into the TIMM/Transformers packages has not landed yet.
+- The performance results are what we were able to reproduce; they should not be used for any other purpose.
diff --git a/examples/03_bert/benchmark_ait.py b/examples/03_bert/benchmark_ait.py
new file mode 100644
index 000000000..9847cb910
--- /dev/null
+++ b/examples/03_bert/benchmark_ait.py
@@ -0,0 +1,298 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +import os +from collections import OrderedDict + +from typing import Dict, List + +import click +import numpy as np +import torch +from aitemplate.compiler import compile_model, Model + +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target + +from modeling.bert import BertBaseEncodersOnly, BertBaseUncased +from modeling.torch_model import BertBaseUncased as BertPt + + +def mark_output(y: Tensor) -> None: + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +def create_bert_inputs( + batch_size: int, seq_length: int, dtype: str = "int64" +) -> List[Tensor]: + input_ids = Tensor( + shape=[batch_size, seq_length], + name="input_ids", + dtype=dtype, + is_input=True, + ) + token_type_ids = Tensor( + shape=[batch_size, seq_length], + name="token_type_ids", + dtype=dtype, + is_input=True, + ) + position_ids = Tensor( + shape=[batch_size, seq_length], + name="position_ids", + dtype=dtype, + is_input=True, + ) + return [input_ids, token_type_ids, position_ids] + + +def create_bert_encoders_input( + batch_size: int, seq_length: int, hidden: int, dtype: str = "float16" +): + encoder_input = Tensor( + shape=[batch_size, seq_length, hidden], + name="input", + dtype=dtype, + is_input=True, + ) + return [encoder_input] + + +def create_bert_inputs_pt( + batch_size: int, seq_length: int, dtype: torch.dtype = torch.int64 +) -> Dict[str, torch.Tensor]: + input_ids = torch.randn(batch_size, seq_length).to(dtype).cuda() + token_type_ids = torch.randn(batch_size, seq_length).to(dtype).cuda() + position_ids = torch.randn(batch_size, seq_length).to(dtype).cuda() + + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + } + + +def create_bert_encoders_inputs_pt( + batch_size: int, seq_length: int, hidden_size: int +) -> Dict[str, torch.Tensor]: + encoder_input = torch.randn([batch_size, seq_length, hidden_size]).cuda().half() + return {"input": encoder_input} + + +def map_pt_params( + ait_bert, pt_bert, batch_size: int, seq_length: int +) -> Dict[str, torch.Tensor]: + pt_params = dict(pt_bert.named_parameters()) + mapped_pt_params = OrderedDict() + for name, _ in ait_bert.named_parameters(): + ait_name = name.replace(".", "_") + if name in pt_params: + mapped_pt_params[ait_name] = pt_params[name] + continue + + if name.endswith("self.qkv.weight"): + prefix = name[: -len("qkv.weight")] + q_weight = pt_params[prefix + "query.weight"] + k_weight = pt_params[prefix + "key.weight"] + v_weight = pt_params[prefix + "value.weight"] + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + mapped_pt_params[ait_name] = qkv_weight + elif name.endswith("self.qkv.bias"): + prefix = name[: -len("qkv.bias")] + q_bias = pt_params[prefix + "query.bias"] + k_bias = pt_params[prefix + "key.bias"] + v_bias = pt_params[prefix + "value.bias"] + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0) + mapped_pt_params[ait_name] = qkv_bias + elif name.endswith("self.proj.weight"): + prefix = name[: -len("self.proj.weight")] + pt_name = prefix + "output.dense.weight" + mapped_pt_params[ait_name] = pt_params[pt_name] + elif name.endswith("self.proj.bias"): + prefix = name[: -len("self.proj.bias")] + pt_name = prefix + "output.dense.bias" + mapped_pt_params[ait_name] = pt_params[pt_name] + elif name.endswith("cu_length"): + cu_len = 
np.cumsum([0] + [seq_length] * batch_size).astype("int32") + mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda() + else: + pt_param = pt_bert.get_parameter(name) + mapped_pt_params[ait_name] = pt_param + + return mapped_pt_params + + +def benchmark( + batch_size: int, + seq_length: int, + hidden_size: int, + mod: Model, + graph_mode: bool, + encoders_only: bool, +): + if encoders_only: + inputs = create_bert_encoders_inputs_pt(batch_size, seq_length, hidden_size) + else: + inputs = create_bert_inputs_pt(batch_size, seq_length) + outputs = [torch.empty(mod.get_output_maximum_shape(0)).cuda().half()] + + # warm up + t, _, __ = mod.benchmark_with_tensors( + inputs, + outputs, + count=100, + repeat=4, + graph_mode=graph_mode, + ) + # benchmark + t, _, __ = mod.benchmark_with_tensors( + inputs, + outputs, + count=100, + repeat=4, + graph_mode=graph_mode, + ) + print(f"batch_size: {batch_size}, seq_length: {seq_length}, latency: {t}") + dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1") + dev_flag = dev_flag.replace(",", "_") + with open(f"bert_ait_benchmark_dev_{dev_flag}.txt", "a") as f: + f.write(f"batch_size: {batch_size}, seq_length: {seq_length}, latency: {t}\n") + + +def compile_module( + batch_size: int, + seq_length: int, + hidden_size: int, + activation: str, + use_fp16_acc: bool, + encoders_only: bool, + pt_model: torch.nn.Module, +) -> None: + model_name = f"BERT_{activation}_{batch_size}_{seq_length}" + target = detect_target(use_fp16_acc=use_fp16_acc) + + if encoders_only: + inputs = create_bert_encoders_input(batch_size, seq_length, hidden_size) + else: + inputs = create_bert_inputs(batch_size, seq_length) + + if encoders_only: + model = BertBaseEncodersOnly(batch_size, seq_length, hidden_act=activation) + else: + model = BertBaseUncased(batch_size, seq_length, hidden_act=activation) + + # Mark all parameters with name same to PyTorch name convention + model.name_parameter_tensor() + # Forward the input tensor to the model, get output tensor + y = model(*inputs) + # Mark output tensor + mark_output(y) + + params = map_pt_params(model, pt_model, batch_size, seq_length) + + mod = compile_model(y, target, "./tmp", model_name) + + for k, v in params.items(): + mod.set_constant_with_tensor(k, v) + + return mod + + +@click.command() +@click.option("--batch-size", type=int, default=0, help="Inference batch size") +@click.option("--seq-length", type=int, default=0, help="Inference sequence length") +@click.option( + "--activation", + type=str, + default="fast_gelu", + help="Activation function applied on BERT, currently only support fast_gelu on Rocm. CUDA supports both gelu and fast_gelu. No effect if framework is pt.", +) +@click.option( + "--graph-mode", + type=bool, + default=True, + help="Use CUDA graph or not. hipGraph is not supported yet. No effect if framework is pt.", +) +@click.option( + "--use-fp16-acc", + type=bool, + default=True, + help="Use fp16 accumulation or not (TensorRT is using fp16_acc). No effect if framework is pt.", +) +@click.option( + "--use-pretrained-pt-model", + type=bool, + default=True, + help="Whether or not to use the pretrained BERT model weights.", +) +@click.option( + "--encoders-only", + type=bool, + default=True, + help="Whether or not to run the BERT benchmark with encoders only. 
If enabled, only the transformer blocks without BERT embeddings are benchmarked.",
+)
+def compile_and_benchmark(
+    batch_size: int,
+    seq_length: int,
+    activation: str,
+    graph_mode: bool,
+    use_fp16_acc: bool,
+    use_pretrained_pt_model: bool,
+    encoders_only: bool,
+):
+    if detect_target().name() == "rocm":
+        graph_mode = False
+        assert activation in (
+            "fast_gelu",
+        ), f"Unsupported activation: {activation} on rocm"
+
+    pt_model = BertPt(pretrained=use_pretrained_pt_model)._model
+    pt_model.eval()
+    hidden_size = pt_model.config.hidden_size
+
+    if batch_size < 1:
+        batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
+    else:
+        batch_sizes = [batch_size]
+
+    if seq_length < 1:
+        seq_lengths = (
+            [64, 128, 384, 512, 1024, 4096] if encoders_only else [64, 128, 384, 512]
+        )
+    else:
+        seq_lengths = [seq_length]
+
+    for seq_length in seq_lengths:
+        for bs in batch_sizes:
+            mod = compile_module(
+                bs,
+                seq_length,
+                hidden_size,
+                activation,
+                use_fp16_acc,
+                encoders_only,
+                pt_model,
+            )
+            benchmark(bs, seq_length, hidden_size, mod, graph_mode, encoders_only)
+
+
+if __name__ == "__main__":
+    torch.manual_seed(4896)
+    compile_and_benchmark()
diff --git a/examples/03_bert/benchmark_mi250.sh b/examples/03_bert/benchmark_mi250.sh
new file mode 100644
index 000000000..32e935650
--- /dev/null
+++ b/examples/03_bert/benchmark_mi250.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size $1 &
+HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size $1 && fg
diff --git a/examples/03_bert/benchmark_pt.py b/examples/03_bert/benchmark_pt.py
new file mode 100644
index 000000000..586df4fea
--- /dev/null
+++ b/examples/03_bert/benchmark_pt.py
@@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +import click +import torch +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from modeling.torch_model import BertBaseUncased + + +def benchmark_pt(pretrained=True, batchsize=0): + bert = BertBaseUncased(pretrained) + model = bert._model + model.eval() + + if batchsize == 0: + candidate_batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256] + else: + candidate_batch_sizes = [batchsize] + + with torch.inference_mode(): + for seq_length in [64, 128, 384, 512]: + for batch_size in candidate_batch_sizes: + try: + input_ids, token_type_ids, position_ids = bert.generate_inputs( + batch_size, seq_length + ) + bert.forward( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + # warmup + t = benchmark_torch_function( + 100, + bert.forward, + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + # benchmark + t = benchmark_torch_function( + 100, + bert.forward, + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + print( + f"bert pt: batch_size: {batch_size}, seq_length: {seq_length}, {t} ms", + ) + with open("bert_pt_benchmark.txt", "a") as f: + f.write( + f"batch_size: {batch_size}, seq_length: {seq_length} latency: {t} ms\n" + ) + except RuntimeError: + # pt runs out of memory + break + + +def benchmark_pt_encoders_only(pretrained=True, batchsize=0): + model = BertBaseUncased(pretrained) + pt_bert = model._model + pt_bert.eval() + + encoder = pt_bert.bert.encoder + hidden_size = pt_bert.config.hidden_size + + if batchsize == 0: + candidate_batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256] + else: + candidate_batch_sizes = [batchsize] + + for seq_length in [64, 128, 384, 512, 1024, 4096]: + for batch_size in candidate_batch_sizes: + try: + encoder_input = ( + torch.randn([batch_size, seq_length, hidden_size]).cuda().half() + ) + encoder.forward(encoder_input) + # warmup + t = benchmark_torch_function( + 100, + encoder.forward, + encoder_input, + ) + # benchmark + t = benchmark_torch_function( + 100, + encoder.forward, + encoder_input, + ) + print( + f"bert encoders pt: batch_size: {batch_size}, seq_length: {seq_length}, {t} ms", + ) + with open("bert_encoders_pt_benchmark.txt", "a") as f: + f.write( + f"batch_size: {batch_size}, seq_length: {seq_length} latency: {t} ms\n" + ) + except RuntimeError: + # pt runs out of memory + break + + +@click.command() +@click.option( + "--use-pretrained-pt-model", + type=bool, + default=True, + help="Whether or not to use the pretrained BERT model weights.", +) +@click.option( + "--encoders-only", + type=bool, + default=True, + help="Whether or not to run the BERT benchmark with encoders only. If enabled, only the transformer blocks without BERT embeddings are benchmarked.", +) +@click.option( + "--batch-size", + type=int, + default=0, + help="The batch size to use for the benchmark. If 0, the batch size is default [1 : 128].", +) +def benchmark( + use_pretrained_pt_model: bool, + encoders_only: bool, + batch_size: int, +): + if encoders_only: + benchmark_pt_encoders_only(use_pretrained_pt_model, batch_size) + else: + benchmark_pt(use_pretrained_pt_model, batch_size) + + +if __name__ == "__main__": + torch.manual_seed(4896) + benchmark() diff --git a/examples/03_bert/demo.py b/examples/03_bert/demo.py new file mode 100644 index 000000000..d783b6423 --- /dev/null +++ b/examples/03_bert/demo.py @@ -0,0 +1,108 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import click + +import torch + +from benchmark_ait import compile_module +from modeling.torch_model import BertBaseUncased as BertPt +from transformers import BertTokenizer + + +def prepare_data(prompt: str): + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + result = tokenizer(prompt, return_attention_mask=False, return_tensors="pt") + target_size = result["input_ids"].size() + if target_size[1] > 512: + raise ValueError("Sequence length > 512 is not supported") + + result["position_ids"] = ( + torch.arange(target_size[1], dtype=torch.int64) + .reshape(result["input_ids"].size()) + .contiguous() + .cuda() + ) + return result + + +def run_model( + prompt: str, activation: str, graph_mode: bool, use_fp16_acc: bool, verify: bool +): + inputs = prepare_data(prompt) + inputs_pt = {name: data.cuda() for name, data in inputs.items()} + batch_size, seq_len = inputs["input_ids"].size() + + pt_model = BertPt(pretrained=True)._model + pt_model.eval() + hidden_size = pt_model.config.hidden_size + + mod = compile_module( + batch_size, seq_len, hidden_size, activation, use_fp16_acc, False, pt_model + ) + + outputs = [torch.empty(mod.get_output_maximum_shape(0)).half().cuda()] + mod.run_with_tensors(inputs_pt, outputs, graph_mode=graph_mode) + + print(f"Logits: {outputs[0]}") + if verify: + pt_outputs = pt_model.bert(**inputs_pt) + torch.allclose(outputs[0], pt_outputs.last_hidden_state, 1e-1, 1e-1) + print("Verification done!") + + +@click.command() +@click.option( + "--prompt", + type=str, + default="The quick brown fox jumps over the lazy dog.", + help="The prompt to give BERT.", +) +@click.option( + "--activation", + type=str, + default="fast_gelu", + help="Activation function applied on BERT, currently only support gelu and fast_gelu", +) +@click.option( + "--graph_mode", + type=bool, + default=True, + help="Use CUDA graph or not. (hipGraph is not supported yet)", +) +@click.option( + "--use_fp16_acc", + type=bool, + default=True, + help="Use fp16 accumulation or not (TensorRT is using fp16_acc)", +) +@click.option( + "--verify", + type=bool, + default=True, + help="Verify AIT outputs against PT", +) +def run_demo( + prompt: str, + activation: str, + graph_mode: bool, + use_fp16_acc: bool, + verify: bool, +): + run_model(prompt, activation, graph_mode, use_fp16_acc, verify) + + +if __name__ == "__main__": + torch.manual_seed(4896) + run_demo() diff --git a/examples/03_bert/modeling/__init__.py b/examples/03_bert/modeling/__init__.py new file mode 100644 index 000000000..5cf1a826f --- /dev/null +++ b/examples/03_bert/modeling/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/examples/03_bert/modeling/bert.py b/examples/03_bert/modeling/bert.py new file mode 100644 index 000000000..a3a29b54f --- /dev/null +++ b/examples/03_bert/modeling/bert.py @@ -0,0 +1,391 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Tuple + +from aitemplate.compiler import ops +from aitemplate.frontend import nn, Tensor +from aitemplate.testing import detect_target + +# pylint: disable=W0102 + +USE_CUDA = detect_target().name() == "cuda" + + +class BertSelfOutput(nn.Module): + def __init__(self, hidden_size, layer_norm_eps): + """dense + add is included in nn.MultiheadAttention. + This class now only contains LayerNorm. + """ + super().__init__() + self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps) + + def forward(self, hidden_states: Tensor) -> Tensor: + if not USE_CUDA: + hidden_states = ( + hidden_states + if hidden_states._rank() == 2 + else ops.reshape()(hidden_states, [-1, hidden_states._size(-1)]) + ) + # [B, S, H] on cuda, [B * S, H] on rocm + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertAttention(nn.Module): + def __init__( + self, + batch_size, + seq_len, + hidden_size, + num_attention_heads, + layer_norm_eps, + attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0, + ): + super().__init__() + self.self = nn.MultiheadAttention( + dim=hidden_size, + batch_size=batch_size, + seq_len=seq_len, + num_heads=num_attention_heads, + qkv_bias=True, + attn_drop=attention_probs_dropout_prob, + proj_drop=hidden_dropout_prob, + has_residual=True, + ) + self.output = BertSelfOutput(hidden_size, layer_norm_eps) + + def forward( + self, + hidden_states: Tensor, + ) -> Tuple[Tensor]: + self_output = self.self(hidden_states, hidden_states) + attention_output = self.output(self_output) + outputs = (attention_output,) + return outputs + + +# FFN block +class BertIntermediate(nn.Module): + def __init__(self, hidden_size, intermediate_size, hidden_act): + super().__init__() + # dense + activation + self.dense = nn.Linear( + hidden_size, intermediate_size, specialization=hidden_act + ) + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__( + self, hidden_size, intermediate_size, layer_norm_eps, hidden_dropout_prob + ): + super().__init__() + assert hidden_dropout_prob == 0.0 + # dense + add + self.dense = nn.Linear(intermediate_size, hidden_size, specialization="add") + self.dropout = nn.Dropout(hidden_dropout_prob) + 
self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps) + + def forward(self, hidden_states: Tensor, input_tensor: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states, input_tensor) + # hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLayer(nn.Module): + def __init__( + self, + hidden_size, + batch_size, + seq_len, + num_attention_heads, + intermediate_size, + hidden_act, + layer_norm_eps, + attention_probs_dropout_prob, + hidden_dropout_prob, + ): + super().__init__() + self.attention = BertAttention( + batch_size=batch_size, + seq_len=seq_len, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + ) + self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act) + self.output = BertOutput( + hidden_size, intermediate_size, layer_norm_eps, hidden_dropout_prob + ) + + def feed_forward(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def forward( + self, + hidden_states: Tensor, + ): + # [B, S, H] + shape = hidden_states.shape() + # [B, S, H] on cuda, [B * S, H] on rocm + self_attention_outputs = self.attention(hidden_states) + layer_output = self.feed_forward(self_attention_outputs[0]) + # [B * S, H] to [B, S, H] on rocm + layer_output = ( + layer_output + if layer_output._rank() == 3 + else ops.reshape()(layer_output, shape) + ) + return (layer_output,) + + +class BertEncoder(nn.Module): + def __init__( + self, + num_hidden_layers, + hidden_size, + batch_size, + seq_len, + num_attention_heads, + intermediate_size, + hidden_act, + layer_norm_eps, + attention_probs_dropout_prob, + hidden_dropout_prob, + ): + super().__init__() + self.layer = nn.ModuleList( + [ + BertLayer( + batch_size=batch_size, + seq_len=seq_len, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + ) + for _ in range(num_hidden_layers) + ] + ) + + def forward( + self, + hidden_states: Tensor, + ): + for layer_module in self.layer: + layer_outputs = layer_module(hidden_states) + hidden_states = layer_outputs[0] + + return layer_outputs + + +class BertModel(nn.Module): + def __init__( + self, + batch_size, + seq_len, + vocab_size, + max_position_embeddings, + type_vocab_size, + num_hidden_layers, + hidden_size, + num_attention_heads, + intermediate_size, + hidden_act, + layer_norm_eps, + attention_probs_dropout_prob, + hidden_dropout_prob, + add_pooling_layer=False, + ): + super().__init__() + assert not add_pooling_layer + + self.embeddings = nn.BertEmbeddings( + hidden_size=hidden_size, + vocab_size=vocab_size, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + layer_norm_eps=layer_norm_eps, + hidden_dropout_prob=hidden_dropout_prob, + ) + self.encoder = BertEncoder( + batch_size=batch_size, + seq_len=seq_len, + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + 
hidden_dropout_prob=hidden_dropout_prob, + ) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Tensor, + position_ids: Tensor, + ): + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + encoder_outputs = self.encoder( + embedding_output, + ) + return encoder_outputs + + +class BertModelEncodersOnly(nn.Module): + def __init__( + self, + batch_size, + seq_len, + num_hidden_layers, + hidden_size, + num_attention_heads, + intermediate_size, + hidden_act, + layer_norm_eps, + attention_probs_dropout_prob, + hidden_dropout_prob, + add_pooling_layer=False, + ): + super().__init__() + assert not add_pooling_layer + + self.encoder = BertEncoder( + batch_size=batch_size, + seq_len=seq_len, + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + ) + + def forward( + self, + encoder_input: Tensor, + ): + encoder_outputs = self.encoder(encoder_input) + return encoder_outputs + + +class BertBaseUncased(nn.Module): + """Bert base uncased with no classification head.""" + + def __init__( + self, + batch_size, + seq_len, + vocab_size=30522, + max_position_embeddings=512, + type_vocab_size=2, + num_hidden_layers=12, + hidden_size=768, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + layer_norm_eps=1e-12, + attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0, + ): + super().__init__() + self.bert = BertModel( + batch_size=batch_size, + seq_len=seq_len, + vocab_size=vocab_size, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + add_pooling_layer=False, + ) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Tensor, + position_ids: Tensor, + ) -> Tensor: + outputs = self.bert( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + return outputs + + +class BertBaseEncodersOnly(nn.Module): + """Bert base uncased with no classification head and no embeddings.""" + + def __init__( + self, + batch_size, + seq_len, + num_hidden_layers=12, + hidden_size=768, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + layer_norm_eps=1e-12, + attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0, + ): + super().__init__() + self.bert = BertModelEncodersOnly( + batch_size=batch_size, + seq_len=seq_len, + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + layer_norm_eps=layer_norm_eps, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + add_pooling_layer=False, + ) + + def forward( + self, + encoder_input: Tensor, + ) -> Tensor: + outputs = self.bert(encoder_input) + return outputs diff --git a/examples/03_bert/modeling/torch_model.py b/examples/03_bert/modeling/torch_model.py new file mode 100644 index 000000000..cbc965c70 --- /dev/null +++ 
b/examples/03_bert/modeling/torch_model.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import torch +from transformers import AutoModelForMaskedLM, BertForMaskedLM + + +class BertBaseUncased: + def __init__(self, pretrained=True): + if not pretrained: + pretrained = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") + self._model = BertForMaskedLM(pretrained.config).cuda().half() + else: + self._model = ( + AutoModelForMaskedLM.from_pretrained("bert-base-uncased").cuda().half() + ) + self._vocab_size = 30522 + + def forward(self, *args, **kwargs): + # runs the full model with classification head + outputs = self._model(*args, **kwargs) + return outputs.logits + + def generate_inputs(self, batch_size, seq_len): + dtype = torch.long + input_ids = torch.randint( + 0, self._vocab_size, (batch_size, seq_len), dtype=dtype + ).cuda() + token_type_ids = torch.zeros(input_ids.size(), dtype=dtype).cuda() + position_ids = ( + torch.arange(seq_len, dtype=dtype) + .reshape((1, -1)) + .expand(batch_size, -1) + .contiguous() + .cuda() + ) + return (input_ids, token_type_ids, position_ids) + + def get_parameters(self): + return dict(self._model.named_parameters()) diff --git a/examples/04_vit/README.md b/examples/04_vit/README.md new file mode 100644 index 000000000..fc747b4a1 --- /dev/null +++ b/examples/04_vit/README.md @@ -0,0 +1,126 @@ +# Vision Transformer (VIT) + +In this example, we will demo how to lower a pretrained Vision Transformer from TIMM, and run inference in AITemplate. We tested on two variants of Vision Transformer: Base version with 224x224 input / patch 16, and Large version with 384x384 input / patch 16. 
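+
+As a quick orientation before the file listing below, the following sketch shows one hypothetical way to drive this example directly from Python. It is a minimal sketch rather than a supported API: the `compile_vit` and `benchmark` signatures are taken from `benchmark_ait.py` in this folder, and the usual entry point is simply `python3 benchmark_ait.py`.
+
+```
+from benchmark_ait import compile_vit, benchmark
+
+# Compile the 224x224 base model for batch size 1, then benchmark the generated module.
+# (Use graph_mode=False on ROCm, as main() in benchmark_ait.py does.)
+mod = compile_vit("vit_base_patch16_224", batch_size=1, use_fp16_acc=True)
+benchmark("vit_base_patch16_224", 1, mod=mod, graph_mode=True)
+```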
+ +## Code structure +``` +modeling + vision_transformer.py # VIT definition using AIT's frontend API +weight_utils.py # Utils to convert TIMM VIT weights to AIT +verification.py # Numerical verification between TIMM and AIT +benchmark_pt.py # Benchmark code for PyTorch +benchmark_ait.py # Benchmark code for AITemplate +``` + +## Reference Speed vs PyTorch Eager + +### A100-40GB / CUDA 11.6.2 +_PT = PyTorch 1.12 Eager_ + +- vit_base_patch16_224 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 4.95 | 202.15 | 1.02 | 979.31 | +| 2 | 5.26 | 380.43 | 1.15 | 1735.64 | +| 4 | 5.51 | 726.08 | 1.57 | 2543.72 | +| 8 | 5.56 | 1439.03 | 2.20 | 3642.16 | +| 16 | 8.59 | 1863.35 | 3.64 | 4396.74 | +| 32 | 15.95 | 2006.62 | 6.51 | 4916.93 | +| 64 | 31.48 | 2032.77 | 12.67 | 5052.52 | +| 128 | 59.86 | 2138.35 | 25.10 | 5099.77 | +| 256 | 115.00 | 2226.10 | 48.55 | 5273.03 | + + +- vit_large_patch16_384 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 9.88 | 101.17 | 3.84 | 260.21 | +| 2 | 11.90 | 168.02 | 5.87 | 340.98 | +| 4 | 21.20 | 188.66 | 11.49 | 348.09 | +| 8 | 39.33 | 203.43 | 19.09 | 419.07 | +| 16 | 76.00 | 210.54 | 36.19 | 442.08 | +| 32 | 147.24 | 217.33 | 70.03 | 456.93 | +| 64 | 291.00 | 219.93 | 135.25 | 473.21 | +| 128 | 578.99 | 221.08 | 267.09 | 479.24 | +| 256 | 1204.16 | 212.60 | 538.97 | 474.98 | + + +### MI-250 / ROCm 5.2.3 / HIPCC-10736 +_PT = PyTorch 1.12 Eager_ + +#### 1 GCD + +- vit_base_patch16_224 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 3.54 | 282.12 | 3.49 | 286.26 | +| 2 | 4.43 | 451.73 | 3.78 | 528.84 | +| 4 | 6.09 | 657.02 | 4.05 | 986.95 | +| 8 | 9.65 | 829.27 | 5.31 | 1507.06 | +| 16 | 16.62 | 962.98 | 8.50 | 1882.72 | +| 32 | 29.87 | 1071.25 | 14.43 | 2218.07 | +| 64 | 56.58 | 1131.08 | 26.52 | 2413.45 | +| 128 | 110.28 | 1160.73 | 51.62 | 2479.69 | +| 256 | 217.07 | 1179.35 | 102.82 | 2489.89 | + + + +- vit_large_patch16_384 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | 12.90 | 77.51 | 9.70 | 103.05 | +| 2 | 22.42 | 89.19 | 13.40 | 149.29 | +| 4 | 38.16 | 104.83 | 22.12 | 180.86 | +| 8 | 70.58 | 113.35 | 38.46 | 208.00 | +| 16 | 136.28 | 117.40 | 70.44 | 227.15 | +| 32 | 261.97 | 122.15 | 138.14 | 231.65 | +| 64 | 541.90 | 118.10 | 270.01 | 237.02 | +| 128 | 1108.36 | 115.49 | 534.97 | 239.27 | +| 256 | 2213.09 | 115.68 | 1063.24 | 240.77 | + + +#### 2 GCDs + +- vit_base_patch16_224 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | +|------------|-----------------|---------------|------------------|----------------| +| 1 | | | | | +| 2 | 3.49 | 572.95 | 3.59 | 556.55 | +| 4 | 4.11 | 974.26 | 3.97 | 1006.80 | +| 8 | 5.88 | 1359.64 | 4.23 | 1889.44 | +| 16 | 9.75 | 1641.06 | 5.71 | 2800.69 | +| 32 | 17.55 | 1823.03 | 9.34 | 3426.32 | +| 64 | 31.31 | 2043.79 | 16.24 | 3940.53 | +| 128 | 60.33 | 2121.64 | 30.97 | 4133.14 | +| 256 | 117.96 | 2170.29 | 59.82 | 4279.21 | + + +- vit_large_patch16_384 + +| Batch size | PT Latency (ms) | PT QPS (im/s) | AIT Latency (ms) | AIT QPS (im/s) | 
+|------------|-----------------|---------------|------------------|----------------|
+| 1          |                 |               |                  |                |
+| 2          | 12.73           | 157.07        | 10.52            | 190.13         |
+| 4          | 22.97           | 174.12        | 14.94            | 267.82         |
+| 8          | 39.78           | 201.08        | 24.55            | 325.85         |
+| 16         | 74.95           | 213.48        | 43.95            | 364.07         |
+| 32         | 146.18          | 218.91        | 82.04            | 390.06         |
+| 64         | 283.04          | 226.12        | 162.62           | 393.55         |
+| 128        | 583.03          | 219.54        | 313.34           | 408.51         |
+| 256        | 1197.56         | 213.77        | 621.71           | 411.77         |
+
+
+### Note for Performance Results
+
+- For NVIDIA A100, our test cluster doesn't allow locking the GPU frequency. We run a longer warm-up to collect more stable results, but small variance relative to locked-frequency numbers is still expected.
+- To benchmark MI-250, first run `python3 benchmark_ait.py` to generate all necessary model dynamic libraries on a single GCD. Then run `./benchmark_mi250.sh {batch_size}` to simulate data-parallel execution on 2 GCDs, with each GCD processing half of the batch.
+- To benchmark MI-250 with 1 GCD, we lock the frequency with `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
+- To benchmark MI-250 with 2 GCDs, we observed a performance regression with the ROCm perf-determinism mode. The 2-GCD numbers are therefore collected without perf-determinism mode, reset with `rocm-smi -d x --resetperfdeterminism`, where `x` is the GPU id.
+- The PyTorch Eager results do not reflect [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/), mainly because the BetterTransformer integration into the TIMM/Transformers packages has not yet landed.
+- Performance results are what we can reproduce; they should not be used for other purposes.
diff --git a/examples/04_vit/benchmark_ait.py b/examples/04_vit/benchmark_ait.py
new file mode 100644
index 000000000..c302d297d
--- /dev/null
+++ b/examples/04_vit/benchmark_ait.py
@@ -0,0 +1,186 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +"""benchmark for vit""" + +import os + +import click +import numpy as np +import torch +from aitemplate.compiler import compile_model, Model + +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target + +from modeling.vision_transformer import VisionTransformer +from weight_utils import export_to_torch_tensor + +# flake8: noqa + + +def mark_output(y): + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +USE_CUDA = detect_target().name() == "cuda" + + +def compile_vit( + model_name, + batch_size, + class_token=False, + global_pool="avg", + use_fp16_acc=True, +): + img_size = 224 + patch_size = 16 + embed_dim = 768 + num_heads = 12 + depth = 12 + if model_name == "vit_base_patch16_224": + img_size = 224 + patch_size = 16 + embed_dim = 768 + num_heads = 12 + depth = 12 + elif model_name == "vit_large_patch16_384": + img_size = 384 + patch_size = 16 + embed_dim = 1024 + num_heads = 16 + depth = 24 + seqlen = (img_size // patch_size) ** 2 + (1 if class_token else 0) + ait_model = VisionTransformer( + batch_size=batch_size, + img_size=img_size, + class_token=class_token, + global_pool=global_pool, + num_heads=num_heads, + embed_dim=embed_dim, + patch_size=patch_size, + depth=depth, + act_layer="GELU", + ) + ait_model.name_parameter_tensor() + inputs_ait = Tensor( + [batch_size, img_size, img_size, 3], name="input0", is_input=True + ) + Y = ait_model(inputs_ait) + mark_output(Y) + + target = detect_target(use_fp16_acc=use_fp16_acc) + exe_module = compile_model( + Y, target, "./tmp", "vision_transformer_bs%d_seq%d" % (batch_size, seqlen) + ) + return exe_module + + +def benchmark(model_name, batch_size, mod=None, graph_mode=True): + # load mod + if model_name == "vit_base_patch16_224": + img_size = 224 + patch_size = 16 + embed_dim = 768 + num_heads = 12 + depth = 12 + elif model_name == "vit_large_patch16_384": + img_size = 384 + patch_size = 16 + embed_dim = 1024 + num_heads = 16 + depth = 24 + else: + raise NotImplementedError + + seqlen = (img_size // patch_size) ** 2 + + if mod is None: + model_dir = f"vision_transformer_bs{batch_size}_seq{seqlen}" + mod = Model(os.path.join("./tmp", model_dir, "test.so")) + + # prepare params + params_ait = export_to_torch_tensor(model_name) + if detect_target().name() == "cuda": + ait_key = "attn_cu_length" + for i in range(depth): + prefix = "blocks_%d" % (i) + cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32") + params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda() + + # set weights + for name, weight in params_ait.items(): + mod.set_constant_with_tensor(name, weight) + + # prepare input/output tensor + inputs = [torch.randn([batch_size, img_size, img_size, 3]).cuda().half()] + ys = [] + num_ouputs = len(mod.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = mod.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + # warm up + t, _, __ = mod.benchmark_with_tensors( + inputs, + ys, + count=100, + repeat=4, + graph_mode=graph_mode, + ) + # benchmark + t, _, __ = mod.benchmark_with_tensors( + inputs, + ys, + count=100, + repeat=4, + graph_mode=graph_mode, + ) + print(f"batch_size: {batch_size}, latency: {t}") + dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1") + dev_flag = dev_flag.replace(",", "_") + with 
open(f"{model_name}_ait_benchmark_dev_{dev_flag}.txt", "a") as f: + f.write(f"batch_size: {batch_size}, latency: {t}\n") + + +@click.command() +@click.option("--model-name", type=str, default="vit_base_patch16_224") +@click.option( + "--use-fp16-acc", + type=bool, + default=True, + help="Whether to use FP16 for accumulation (similar to TensorRT)", +) +@click.option("--use-graph", type=bool, default=True, help="Whether to use CUDA graph") +@click.option("--batch-size", type=int, default=0, help="Batch size") +def main( + model_name="vit_base_patch16_224", use_fp16_acc=True, use_graph=True, batch_size=0 +): + if detect_target().name() == "rocm": + use_graph = False + if batch_size < 1: + for bs in (1, 2, 4, 8, 16, 32, 64, 128, 256): + compile_vit(model_name, bs, use_fp16_acc=use_fp16_acc) + benchmark(model_name, bs, graph_mode=use_graph) + else: + benchmark(model_name, batch_size, graph_mode=use_graph) + + +if __name__ == "__main__": + main() diff --git a/examples/04_vit/benchmark_mi250.sh b/examples/04_vit/benchmark_mi250.sh new file mode 100644 index 000000000..883846b68 --- /dev/null +++ b/examples/04_vit/benchmark_mi250.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$1" & +HIP_VISIBLE_DEVICES=1 python3 benchmark_ait.py --batch-size "$1" && fg diff --git a/examples/04_vit/benchmark_pt.py b/examples/04_vit/benchmark_pt.py new file mode 100644 index 000000000..48834e295 --- /dev/null +++ b/examples/04_vit/benchmark_pt.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import os + +import click +import torch +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from timm.models.vision_transformer import VisionTransformer +from torch import nn + + +def create_vit(model_name): + if model_name == "vit_base_patch16_224": + img_size = 224 + embed_dim = 768 + class_token = False + global_pool = "avg" + depth = 12 + patch_size = 16 + num_heads = 12 + elif model_name == "vit_large_patch16_384": + img_size = 384 + embed_dim = 1024 + class_token = False + global_pool = "avg" + depth = 24 + patch_size = 16 + num_heads = 16 + else: + raise NotImplementedError + model = ( + VisionTransformer( + img_size=img_size, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + class_token=class_token, + global_pool=global_pool, + depth=depth, + patch_size=patch_size, + num_heads=num_heads, + embed_dim=embed_dim, + ) + .cuda() + .half() + ) + return model + + +def benchmark(model_name, batch_size, img_size): + if model_name == "vit_base_patch16_224": + img_size = 224 + elif model_name == "vit_large_patch16_384": + img_size = 384 + model = create_vit(model_name) + with torch.inference_mode(): + input_shape = (batch_size, 3, img_size, img_size) + input_data = torch.randn(input_shape).cuda().half() + # warm up + benchmark_torch_function(100, model, input_data) + # benchmark + t = benchmark_torch_function(100, model, input_data) + print("batch_size: {}, time: {}".format(batch_size, t)) + dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1") + dev_flag = dev_flag.replace(",", "_") + with open(f"{model_name}_pt_benchmark_dev_{dev_flag}.txt", "a") as f: + f.write("batch_size: {}, latency: {}\n".format(batch_size, t)) + + +@click.command() +@click.option("--model-name", type=str, default="vit_base_patch16_224") +@click.option("--batch-size", default=0, type=int) +def main(model_name, batch_size): + img_size = 224 + if model_name == "vit_base_patch16_224": + img_size = 224 + elif model_name == "vit_large_patch16_384": + img_size = 384 + else: + raise NotImplementedError + if batch_size == 0: + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256]: + benchmark(model_name, batch_size, img_size) + else: + benchmark(model_name, batch_size, img_size) + + +if __name__ == "__main__": + main() diff --git a/examples/04_vit/modeling/vision_transformer.py b/examples/04_vit/modeling/vision_transformer.py new file mode 100644 index 000000000..5b4fb01f1 --- /dev/null +++ b/examples/04_vit/modeling/vision_transformer.py @@ -0,0 +1,323 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from functools import partial + +from aitemplate.compiler import ops +from aitemplate.frontend import nn +from aitemplate.testing import detect_target + +# pylint: disable=W0102 + +USE_CUDA = detect_target().name() == "cuda" + + +def get_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +class Mlp(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer="GELU", + drop=0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.fc1 = nn.Linear( + in_features, + hidden_features, + specialization="fast_gelu" if act_layer == "GELU" else "relu", + ) + self.fc2 = nn.Linear(hidden_features, out_features, specialization="add") + + def forward(self, x, res): + shape = get_shape(x) + x = self.fc1(x) + x = self.fc2(x, res) + return ops.reshape()(x, shape) + + +class Block(nn.Module): + def __init__( + self, + dim, + batch_size, + seq_len, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + init_values=None, + drop_path=0.0, + act_layer="GELU", + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = nn.MultiheadAttention( + dim, + batch_size, + seq_len, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = nn.Identity() + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path1 = nn.DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=drop, + ) + self.ls2 = nn.Identity() + self.drop_path2 = nn.DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + x = self.attn(self.norm1(x), x) + x = self.mlp(self.norm2(x), x) + return x + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size // patch_size, img_size // patch_size) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + self.embed_dim = embed_dim + + conv_op = ( + nn.Conv2dBiasFewChannels + if detect_target().name() == "cuda" + else nn.Conv2dBias + ) + self.proj = conv_op( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + self.proj_norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, H, W, C = get_shape(x) + x = self.proj(x) + if self.flatten: + x = ops.reshape()(x, [B, -1, self.embed_dim]) + x = self.proj_norm(x) + return x + + +class VisionTransformer(nn.Module): + """Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + def __init__( + self, + img_size=224, + batch_size=1, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool="token", + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + fc_norm=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + weight_init="", + embed_layer=PatchEmbed, + norm_layer=nn.LayerNorm, + 
act_layer=None, + block_fn=Block, + dtype="float16", + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + init_values: (float): layer-scale init values + class_token (bool): use class token + fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None) + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + weight_init (str): weight init scheme + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + act_layer: (nn.Module): MLP activation layer + """ + super().__init__() + assert global_pool in ("", "avg", "token") + assert class_token or global_pool != "token" + use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = ( + self.embed_dim + ) = embed_dim # num_features for consistency with other models + self.num_prefix_tokens = 1 if class_token else 0 + self.no_embed_class = no_embed_class + self.grad_checkpointing = False + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = ( + nn.Parameter(shape=[1, 1, embed_dim], dtype=dtype) if class_token else None + ) + self.cls_token_mask = ( + nn.Parameter(shape=[batch_size, 1, embed_dim], dtype=dtype) + if class_token + else None + ) + embed_len = ( + num_patches if no_embed_class else num_patches + self.num_prefix_tokens + ) + self.pos_embed = nn.Parameter(shape=[1, embed_len, embed_dim], dtype=dtype) + self.pos_drop = nn.Dropout(p=drop_rate) + seq_len = (img_size // patch_size) ** 2 + (1 if class_token else 0) + self.pool_size = img_size // patch_size + + self.blocks = nn.Sequential( + *[ + block_fn( + dim=embed_dim, + batch_size=batch_size, + seq_len=seq_len, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + norm_layer=norm_layer, + act_layer=act_layer, + ) + for i in range(depth) + ] + ) + self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() + + if global_pool == "avg": + self.pool = nn.AvgPool2d(kernel_size=self.pool_size, stride=1, padding=0) + + # Classifier Head + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() + self.head = ( + nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + ) + + def _pos_embed(self, x): + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + self.pos_embed.tensor() + if self.cls_token is not None: + cls_token_expand = ops.expand()( + self.cls_token.tensor(), [get_shape(x)[0], -1, -1] + ) + cls_token_expand = cls_token_expand + self.cls_token_mask.tensor() + x = ops.concatenate()([cls_token_expand, x], dim=1) + 
else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if self.cls_token is not None: + cls_token_expand = ops.expand()( + self.cls_token.tensor(), [get_shape(x)[0], -1, -1] + ) + cls_token_expand = cls_token_expand + self.cls_token_mask.tensor() + x = ops.concatenate()([cls_token_expand, x], dim=1) + x = x + self.pos_embed.tensor() + return self.pos_drop(x) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.blocks(x) + x = self.norm(x) + return x + + def _global_pool(self, x): + batch, seq, d = get_shape(x) + x = ops.reshape()(x, [batch, self.pool_size, self.pool_size, d]) + y = self.pool(x) + return ops.reshape()(y, [batch, d]) + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + if self.global_pool == "avg": + x = self._global_pool(x) + else: + batch, seq, d = get_shape(x) + x = ops.dynamic_slice()( + x, start_indices=[0, 0, 0], end_indices=[batch, 1, d] + ) + x = self.fc_norm(x) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x diff --git a/examples/04_vit/verification.py b/examples/04_vit/verification.py new file mode 100644 index 000000000..0584707bf --- /dev/null +++ b/examples/04_vit/verification.py @@ -0,0 +1,164 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import click +import numpy as np +import torch +from aitemplate.compiler import compile_model +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target +from modeling.vision_transformer import VisionTransformer +from timm.models.vision_transformer import vit_base_patch16_224, vit_large_patch16_384 + +from weight_utils import export_to_torch_tensor + + +def mark_output(y): + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("output_{} shape: {}".format(i, y_shape)) + + +USE_CUDA = detect_target().name() == "cuda" + + +def compile_vit( + batch_size=128, + img_size=224, + patch_size=16, + embed_dim=768, + num_heads=12, + depth=12, + class_token=True, + global_pool="token", + use_fp16_acc=True, +): + seqlen = (img_size // patch_size) ** 2 + (1 if class_token else 0) + ait_model = VisionTransformer( + batch_size=batch_size, + img_size=img_size, + class_token=class_token, + global_pool=global_pool, + num_heads=num_heads, + embed_dim=embed_dim, + patch_size=patch_size, + depth=depth, + act_layer="GELU", + ) + ait_model.name_parameter_tensor() + inputs_ait = Tensor( + [batch_size, img_size, img_size, 3], name="input0", is_input=True + ) + Y = ait_model(inputs_ait) + mark_output(Y) + + target = detect_target(use_fp16_acc=use_fp16_acc) + exe_module = compile_model( + Y, target, "./tmp", "vision_transformer_bs%d_seq%d" % (batch_size, seqlen) + ) + return exe_module + + +def verification( + model_name, + batch_size=3, + use_fp16_acc=True, +): + img_size = 224 + embed_dim = 768 + depth = 12 + patch_size = 16 + num_heads = 12 + class_token = True + global_pool = "token" + if model_name == "vit_base_patch16_224": + img_size = 224 + embed_dim = 768 + depth = 12 + patch_size = 16 + num_heads = 12 + pt_mod = vit_base_patch16_224(pretrained=True).cuda().half() + elif model_name == "vit_large_patch16_384": + img_size = 384 + embed_dim = 1024 + depth = 24 + patch_size = 16 + num_heads = 16 + pt_mod = vit_large_patch16_384(pretrained=True).cuda().half() + + seqlen = (img_size // patch_size) ** 2 + (1 if class_token else 0) + input_pt = torch.randn([batch_size, 3, img_size, img_size]).cuda().half() * 255 + pt_ys = pt_mod(input_pt) + pt_ys = pt_ys.reshape((batch_size, 1, -1)) + + ait_mod = compile_vit( + batch_size=batch_size, + img_size=img_size, + patch_size=patch_size, + embed_dim=embed_dim, + num_heads=num_heads, + depth=depth, + class_token=True, + global_pool=global_pool, + use_fp16_acc=use_fp16_acc, + ) + + # convert weights + params_ait = export_to_torch_tensor(model_name, True) + params_ait["cls_token_mask"] = torch.zeros((batch_size, 1, embed_dim)).cuda().half() + if detect_target().name() == "cuda": + ait_key = "attn_cu_length" + for i in range(depth): + prefix = "blocks_%d" % (i) + cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32") + params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda() + + # set weights + for name, weight in params_ait.items(): + ait_mod.set_constant_with_tensor(name, weight) + + inputs = [input_pt.permute((0, 2, 3, 1)).contiguous()] + ys = [] + num_ouputs = len(ait_mod.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = ait_mod.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + ait_mod.run_with_tensors(inputs, ys) + eps = 1e-1 + np.testing.assert_allclose( + pt_ys.detach().cpu().numpy(), + ys[0].cpu().numpy(), + atol=eps, 
+ rtol=eps, + ) + print("vision transformer verification pass") + + +@click.command() +@click.option("--model-name", type=str, default="vit_base_patch16_224") +@click.option("--use-fp16-acc", type=bool, default=True) +def main(model_name, use_fp16_acc): + if model_name not in ("vit_base_patch16_224", "vit_large_patch16_384"): + raise ValueError( + "model name should be vit_base_patch16_224 or vit_large_patch16_384" + ) + verification(model_name, use_fp16_acc=use_fp16_acc) + + +if __name__ == "__main__": + main() diff --git a/examples/04_vit/weight_utils.py b/examples/04_vit/weight_utils.py new file mode 100644 index 000000000..49d3c9eed --- /dev/null +++ b/examples/04_vit/weight_utils.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""script for converting vit model from timm to ait +""" +import pickle + +import click +import torch +import torch.nn as nn +from aitemplate.testing.detect_target import detect_target +from timm.models.vision_transformer import ( + VisionTransformer, + vit_base_patch16_224, + vit_large_patch16_384, +) + + +def convert_vit(model_name, pretrained=False): + img_size = 224 + embed_dim = 768 + patch_size = 16 + depth = 12 + mod = None + if model_name == "vit_base_patch16_224": + if pretrained: + mod = vit_base_patch16_224(pretrained=pretrained).cuda().half() + else: + mod = ( + VisionTransformer( + img_size=img_size, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + class_token=False, + global_pool="avg", + depth=depth, + patch_size=patch_size, + num_heads=12, + embed_dim=embed_dim, + ) + .cuda() + .half() + ) + elif model_name == "vit_large_patch16_384": + img_size = 384 + embed_dim = 1024 + depth = 24 + if pretrained: + mod = vit_large_patch16_384(pretrained=pretrained).cuda().half() + else: + mod = ( + VisionTransformer( + img_size=img_size, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + class_token=False, + global_pool="avg", + depth=24, + patch_size=patch_size, + num_heads=16, + embed_dim=embed_dim, + ) + .cuda() + .half() + ) + else: + print(model_name) + raise NotImplementedError + params_pt = mod.named_parameters() + params_ait = {} + params_ait = {} + for key, arr in params_pt: + ait_key = key.replace(".", "_") + if len(arr.shape) == 4: + arr = arr.permute((0, 2, 3, 1)).contiguous() + if detect_target().name() == "cuda": + conv0_w_pad = ( + torch.zeros((embed_dim, patch_size, patch_size, 4)).cuda().half() + ) + conv0_w_pad[:, :, :, :3] = arr + arr = conv0_w_pad + params_ait[f"{ait_key}"] = arr + return params_ait + + +def export_to_torch_tensor(model_name, pretrained=False): + params_ait = convert_vit(model_name, pretrained) + return params_ait + + +@click.command() +@click.option("--model_name", default="vit_base_patch16_224", help="model name") +@click.option("--param-path", default="vit.pkl", help="saved numpy weights path") +@click.option("--pretrained", default=False, help="use pretrained weights") +def export_to_numpy(model_name, param_path, pretrained=False): + params_ait = 
convert_vit(model_name, pretrained)
+    params_np = {k: v.detach().cpu().numpy() for k, v in params_ait.items()}
+
+    with open(param_path, "wb") as f:
+        pickle.dump(params_np, f)
+
+
+if __name__ == "__main__":
+    export_to_numpy()
diff --git a/examples/05_stable_diffusion/README.md b/examples/05_stable_diffusion/README.md
new file mode 100644
index 000000000..a98021a3a
--- /dev/null
+++ b/examples/05_stable_diffusion/README.md
@@ -0,0 +1,136 @@
+## Stable Diffusion Example
+
+In this example, we show how to build fast AIT modules for the CLIP, UNet, and VAE models, and how to benchmark and run them.
+
+### Build Dependencies
+
+The AIT stable diffusion example depends on `diffusers` and `transformers`.
+
+Verify the library versions. We have tested transformers 4.21/4.22/4.23, diffusers 0.3/0.4, and torch 1.11/1.12.
+
+```
+>>> import transformers
+>>> transformers.__version__
+'4.21.2'
+>>> import diffusers
+>>> diffusers.__version__
+'0.3.0'
+>>> import torch
+>>> torch.__version__
+'1.12.1+cu116'
+```
+
+### Build AIT modules for CLIP, UNet, VAE
+
+Build the AIT modules by running `compile.py`:
+
+```
+python3 examples/05_stable_diffusion/compile.py
+```
+It generates three folders: `./tmp/CLIPTextModel`, `./tmp/UNet2DConditionModel`, `./tmp/AutoencoderKL`. In each folder, there is a `test.so` file, which is the generated AIT module for the model.
+
+#### Multi-GPU profiling
+AIT needs to do profiling to select the best algorithms for CUTLASS and CK.
+To enable multiple GPUs for profiling, use the environment variable `CUDA_VISIBLE_DEVICES` on the NVIDIA platform and `HIP_VISIBLE_DEVICES` on the AMD platform.
+
+### Prepare Weights and Benchmark
+
+In this step, we download the Stable Diffusion weights for each model and use them to initialize the parameters in the AIT modules. Then we benchmark the AIT modules.
+
+1. Register in Hugging Face Hub to obtain an access token for the Stable Diffusion weights. See [user access tokens](https://huggingface.co/docs/hub/security-tokens).
+
+2. (Optional) Run `benchmark.py` with the access token to initialize the weights and benchmark.
+
+```
+python3 examples/05_stable_diffusion/benchmark.py --token ACCESS_TOKEN
+```
+
+### Run Models
+
+Run the AIT models to generate an example image:
+
+```
+python3 examples/05_stable_diffusion/demo.py --token ACCESS_TOKEN
+```
+
+Check the resulting image: `example_ait.png`
+
+
+### Sample outputs
+
+Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "Mountain Rainier in van Gogh's world"`
+
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_rainier.png)
+
+Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "Sitting in a tea house in Japan with Mount Fuji in the background, sunset professional portrait, Nikon 85mm f/1.4G"`
+
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_fuji.png)
+
+Command: `python3 examples/05_stable_diffusion/demo.py --token hf_xxx --prompt "A lot of wild flowers with North Cascade Mountain in background, sunset professional photo, Unreal Engine"`
+
+![sample](https://raw.githubusercontent.com/AITemplate/webdata/main/imgs/example_ait_cascade2.png)
+
+## Results
+
+_PT = PyTorch 1.12 Eager_
+
+_OOM = Out of Memory_
+### A100-40GB / CUDA 11.6, 50 steps
+
+| Module   | PT Latency (ms) | AIT Latency (ms) |
+|----------|-----------------|------------------|
+| CLIP     | 9.48            | 0.87             |
+| UNet     | 60.52           | 22.47            |
+| VAE      | 47.78           | 37.43            |
+| Pipeline | 3058.27         | 1282.98          |
+
+- PT: 17.50 it/s
+- AIT: 42.45 it/s
+
+### RTX 3080-10GB / CUDA 11.6, 50 steps
+
+| Module   | PT Latency (ms) | AIT Latency (ms) |
+|----------|-----------------|------------------|
+| CLIP     | OOM             | 0.85             |
+| UNet     | OOM             | 40.22            |
+| VAE      | OOM             | 44.12            |
+| Pipeline | OOM             | 2163.43          |
+
+- PT: OOM
+- AIT: 24.51 it/s
+
+### MI-250 1 GCD, 50 steps
+
+| Module   | PT Latency (ms) | AIT Latency (ms) |
+|----------|-----------------|------------------|
+| CLIP     | 6.16            | 2.98             |
+| UNet     | 78.42           | 62.18            |
+| VAE      | 63.83           | 164.50           |
+| Pipeline | 4300.16         | 3476.07          |
+
+- PT: 12.43 it/s
+- AIT: 15.60 it/s
+
+## Batched Version
+
+A batched version of AIT Stable Diffusion can be found at: https://github.com/terrychenism/AIT_StableDiffusion/tree/main/examples/05_stable_diffusion
+
+
+Some reference results from that repo:
+
+### A100-40GB, 25 Steps
+
+| Batch size | AIT Latency (ms) | AVG im/s |
+|------------|------------------|----------|
+| 1          | 695              | 0.69     |
+| 3          | 1651             | 0.55     |
+| 8          | 3975             | 0.50     |
+| 16         | 7906             | 0.49     |
+
+
+
+### Note for Performance Results
+
+- For all benchmarks we render images of size 512x512.
+- For NVIDIA A100, our test cluster doesn't allow locking the GPU frequency. We run a longer warm-up to collect more stable results, but small variance relative to locked-frequency numbers is still expected.
+- To benchmark MI-250 with 1 GCD, we lock the frequency with `rocm-smi -d x --setperfdeterminism 1700`, where `x` is the GPU id.
+- Performance results are what we can reproduce and are provided for reference only; they should not be used for other purposes.
diff --git a/examples/05_stable_diffusion/benchmark.py b/examples/05_stable_diffusion/benchmark.py
new file mode 100644
index 000000000..9035ad73e
--- /dev/null
+++ b/examples/05_stable_diffusion/benchmark.py
@@ -0,0 +1,304 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import logging + +import click + +import numpy as np +import torch +from aitemplate.compiler import Model +from aitemplate.testing import detect_target +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from diffusers import StableDiffusionPipeline + +from torch import autocast +from transformers import CLIPTokenizer + +USE_CUDA = detect_target().name() == "cuda" + +access_token = True +pipe = None + + +def get_int_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +def mark_output(y): + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("AIT output_{} shape: {}".format(i, y_shape)) + + +def benchmark_unet( + batch_size=2, + hh=64, + ww=64, + dim=320, + benchmark_pt=False, + verify=False, +): + + exe_module = Model("./tmp/UNet2DConditionModel/test.so") + if exe_module is None: + print("Error!! Cannot find compiled module for UNet2DConditionModel.") + exit(-1) + + # run PT unet model + pt_mod = pipe.unet + pt_mod = pt_mod.eval() + + latent_model_input_pt = torch.randn(batch_size, 4, hh, ww).cuda().half() + text_embeddings_pt = torch.randn(batch_size, 64, 768).cuda().half() + timesteps_pt = torch.Tensor([1, 1]).cuda().half() + + with autocast("cuda"): + pt_ys = pt_mod( + latent_model_input_pt, + timesteps_pt, + encoder_hidden_states=text_embeddings_pt, + ).sample + + # PT benchmark + if benchmark_pt: + args = (latent_model_input_pt, 1, text_embeddings_pt) + pt_time = benchmark_torch_function(100, pt_mod, *args) + print(f"PT batch_size: {batch_size}, {pt_time} ms") + with open("sd_pt_benchmark.txt", "a") as f: + f.write(f"unet batch_size: {batch_size}, latency: {pt_time} ms\n") + + print("pt output:", pt_ys.shape) + + # run AIT unet model + inputs = { + "input0": latent_model_input_pt.permute((0, 2, 3, 1)).contiguous(), + "input1": timesteps_pt, + "input2": text_embeddings_pt, + } + + ys = [] + num_ouputs = len(exe_module.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = exe_module.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + exe_module.run_with_tensors(inputs, ys) + + # verification + y_transpose = ys[0].permute((0, 3, 1, 2)) + + if verify: + eps = 1e-1 + np.testing.assert_allclose( + pt_ys.detach().cpu().numpy(), + y_transpose.cpu().numpy(), + atol=eps, + rtol=eps, + ) + print("UNet2DCondition verification pass") + + # AIT benchmark + # warmup + exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4) + # benchmark + t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4) + with open("sd_ait_benchmark.txt", "a") as f: + f.write(f"unet batch_size: {batch_size}, latency: {t} ms\n") + + +def benchmark_clip( + batch_size=1, + seqlen=64, + dim=768, + num_heads=12, + hidden_size=768, + vocab_size=49408, + max_position_embeddings=77, + benchmark_pt=False, + verify=False, +): + mask_seq = 0 + version = "openai/clip-vit-large-patch14" + + exe_module = Model("./tmp/CLIPTextModel/test.so") + if 
exe_module is None: + print("Error!! Cannot find compiled module for CLIPTextModel.") + exit(-1) + + # run PT clip + pt_mod = pipe.text_encoder + pt_mod = pt_mod.eval() + + tokenizer = CLIPTokenizer.from_pretrained(version) + text_input = tokenizer( + ["a photo of an astronaut riding a horse on mars"], + padding="max_length", + max_length=seqlen, + truncation=True, + return_tensors="pt", + ) + input_ids = text_input["input_ids"].cuda() + + attention_mask = torch.ones((batch_size, seqlen)) + attention_mask[-1, -mask_seq:] = 0 + attention_mask = None + + position_ids = torch.arange(seqlen).expand((1, -1)).cuda() + pt_ys = pt_mod(input_ids, attention_mask, position_ids) + print("pt output:", pt_ys[0].shape) + + # PT benchmark + if benchmark_pt: + args = (input_ids, attention_mask, position_ids) + pt_time = benchmark_torch_function(100, pt_mod, *args) + print(f"PT batch_size: {batch_size}, {pt_time} ms") + with open("sd_pt_benchmark.txt", "a") as f: + f.write(f"clip batch_size: {batch_size}, latency: {pt_time} ms\n") + + # run AIT clip + inputs = { + "input0": input_ids, + "input1": position_ids, + } + ys = [] + num_ouputs = len(exe_module.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = exe_module.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + exe_module.run_with_tensors(inputs, ys) + + # verification + if verify: + eps = 1e-1 + pt_np = pt_ys[0].detach().cpu().numpy() + np.testing.assert_allclose( + pt_np, + ys[0].cpu().numpy(), + atol=eps, + rtol=eps, + ) + print("CLIPTextTransformer verification pass") + + # AIT benchmark + # warmup + exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4) + # benchmark + t, _, _ = exe_module.benchmark_with_tensors(inputs, ys, count=100, repeat=4) + with open("sd_ait_benchmark.txt", "a") as f: + f.write(f"clip batch_size: {batch_size}, latency: {t} ms\n") + + +def benchmark_vae(batch_size=1, height=64, width=64, benchmark_pt=False, verify=False): + + latent_channels = 4 + + exe_module = Model("./tmp/AutoencoderKL/test.so") + if exe_module is None: + print("Error!! 
Cannot find compiled module for AutoencoderKL.") + exit(-1) + + # run PT vae + pt_vae = pipe.vae + pt_vae = pt_vae.cuda().half() + pt_vae.eval() + + pt_input = torch.rand([batch_size, latent_channels, height, width]).cuda().half() + print("pt_input shape", pt_input.shape) + with autocast("cuda"): + pt_output = pt_vae.decode(pt_input).sample + pt_output = pt_output.half() + + # PT benchmark + if benchmark_pt: + args = (pt_input,) + pt_time = benchmark_torch_function(100, pt_vae.decode, *args) + print(f"PT batch_size: {batch_size}, {pt_time} ms") + with open("sd_pt_benchmark.txt", "a") as f: + f.write(f"vae batch_size: {batch_size}, latency: {pt_time} ms\n") + + # run AIT vae + y = ( + torch.empty( + pt_output.size(0), + pt_output.size(2), + pt_output.size(3), + pt_output.size(1), + ) + .cuda() + .half() + ) + ait_input_pt_tensor = torch.permute(pt_input, (0, 2, 3, 1)).contiguous() + print("input pt tensor size: ", ait_input_pt_tensor.shape) + print("output pt tensor size: ", y.shape) + exe_module.run_with_tensors([ait_input_pt_tensor], [y]) + + # verification + if verify: + y_pt = torch.permute(y, (0, 3, 1, 2)) + eps = 1e-1 + np.testing.assert_allclose( + pt_output.detach().cpu().numpy(), + y_pt.cpu().numpy(), + atol=eps, + rtol=eps, + ) + logging.info("VAE Verification done!") + + # AIT benchmark: + # warmup + exe_module.benchmark_with_tensors([ait_input_pt_tensor], [y], count=100, repeat=4) + # benchmark + t, _, _ = exe_module.benchmark_with_tensors( + [ait_input_pt_tensor], [y], count=100, repeat=4 + ) + with open("sd_ait_benchmark.txt", "a") as f: + f.write(f"vae batch_size: {batch_size}, latency: {t} ms\n") + + +@click.command() +@click.option("--token", default="", help="access token") +@click.option("--verify", type=bool, default=False, help="verify correctness") +@click.option("--benchmark-pt", type=bool, default=False, help="run pt benchmark") +def benchmark_diffusers(token, verify, benchmark_pt): + logging.getLogger().setLevel(logging.INFO) + np.random.seed(0) + torch.manual_seed(4896) + + global access_token, pipe + if token != "": + access_token = token + + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=access_token, + ).to("cuda") + + # CLIP + benchmark_clip(benchmark_pt=benchmark_pt, verify=verify) + # UNet + benchmark_unet(batch_size=2, benchmark_pt=benchmark_pt, verify=verify) + # VAE + benchmark_vae(benchmark_pt=benchmark_pt, verify=verify) + + +if __name__ == "__main__": + benchmark_diffusers() diff --git a/examples/05_stable_diffusion/benchmark_pt.py b/examples/05_stable_diffusion/benchmark_pt.py new file mode 100644 index 000000000..3534eaf62 --- /dev/null +++ b/examples/05_stable_diffusion/benchmark_pt.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import click +import torch + +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from diffusers import StableDiffusionPipeline + + +@click.command() +@click.option("--token", default="", help="access token") +@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt") +@click.option( + "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark" +) +def run(token, prompt, benchmark): + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=token, + ).to("cuda") + + with torch.autocast("cuda"): + image = pipe(prompt).images[0] + if benchmark: + t = benchmark_torch_function(10, pipe, prompt) + print(f"sd pt e2e: {t} ms") + + image.save("example_pt.png") + + +if __name__ == "__main__": + run() diff --git a/examples/05_stable_diffusion/compile.py b/examples/05_stable_diffusion/compile.py new file mode 100644 index 000000000..d6bd33c9f --- /dev/null +++ b/examples/05_stable_diffusion/compile.py @@ -0,0 +1,353 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import logging +from collections import OrderedDict + +import click +import numpy as np + +import torch + +from aitemplate.compiler import compile_model +from aitemplate.frontend import Tensor +from aitemplate.testing import detect_target +from diffusers import StableDiffusionPipeline + +from modeling.clip import CLIPTextTransformer as ait_CLIPTextTransformer + +from modeling.unet_2d_condition import UNet2DConditionModel as ait_UNet2DConditionModel + +from modeling.vae import AutoencoderKL as ait_AutoencoderKL + + +USE_CUDA = detect_target().name() == "cuda" + +access_token = True +pipe = None + + +def mark_output(y): + if type(y) is not tuple: + y = (y,) + for i in range(len(y)): + y[i]._attrs["is_output"] = True + y[i]._attrs["name"] = "output_%d" % (i) + y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]] + print("AIT output_{} shape: {}".format(i, y_shape)) + + +def map_unet_params(pt_mod, dim): + pt_params = dict(pt_mod.named_parameters()) + params_ait = {} + for key, arr in pt_params.items(): + if len(arr.shape) == 4: + arr = arr.permute((0, 2, 3, 1)).contiguous() + elif key.endswith("ff.net.0.proj.weight"): + w1, w2 = arr.chunk(2, dim=0) + params_ait[key.replace(".", "_")] = w1 + params_ait[key.replace(".", "_").replace("proj", "gate")] = w2 + continue + elif key.endswith("ff.net.0.proj.bias"): + w1, w2 = arr.chunk(2, dim=0) + params_ait[key.replace(".", "_")] = w1 + params_ait[key.replace(".", "_").replace("proj", "gate")] = w2 + continue + params_ait[key.replace(".", "_")] = arr + + params_ait["arange"] = ( + torch.arange(start=0, end=dim // 2, dtype=torch.float32).cuda().half() + ) + return params_ait + + +def map_vae_params(ait_module, pt_module, batch_size, seq_len): + pt_params = dict(pt_module.named_parameters()) + mapped_pt_params = OrderedDict() + for name, _ in ait_module.named_parameters(): + 
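+        # For each AIT parameter, find the matching PyTorch weight: conv weights are
+        # permuted to NHWC, the separate query/key/value projections are concatenated
+        # into a single fused qkv tensor, and the attention cu_length buffer is built
+        # from the batch size and sequence length.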
ait_name = name.replace(".", "_") + if name in pt_params: + if ( + "conv" in name + and "norm" not in name + and name.endswith(".weight") + and len(pt_params[name].shape) == 4 + ): + mapped_pt_params[ait_name] = torch.permute( + pt_params[name], [0, 2, 3, 1] + ).contiguous() + else: + mapped_pt_params[ait_name] = pt_params[name] + elif name.endswith("attention.qkv.weight"): + prefix = name[: -len("attention.qkv.weight")] + q_weight = pt_params[prefix + "query.weight"] + k_weight = pt_params[prefix + "key.weight"] + v_weight = pt_params[prefix + "value.weight"] + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + mapped_pt_params[ait_name] = qkv_weight + elif name.endswith("attention.qkv.bias"): + prefix = name[: -len("attention.qkv.bias")] + q_bias = pt_params[prefix + "query.bias"] + k_bias = pt_params[prefix + "key.bias"] + v_bias = pt_params[prefix + "value.bias"] + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0) + mapped_pt_params[ait_name] = qkv_bias + elif name.endswith("attention.proj.weight"): + prefix = name[: -len("attention.proj.weight")] + pt_name = prefix + "proj_attn.weight" + mapped_pt_params[ait_name] = pt_params[pt_name] + elif name.endswith("attention.proj.bias"): + prefix = name[: -len("attention.proj.bias")] + pt_name = prefix + "proj_attn.bias" + mapped_pt_params[ait_name] = pt_params[pt_name] + elif name.endswith("attention.cu_length"): + cu_len = np.cumsum([0] + [seq_len] * batch_size).astype("int32") + mapped_pt_params[ait_name] = torch.from_numpy(cu_len).cuda() + else: + pt_param = pt_module.get_parameter(name) + mapped_pt_params[ait_name] = pt_param + + return mapped_pt_params + + +def map_clip_params(pt_mod, batch_size, seqlen, depth): + + params_pt = list(pt_mod.named_parameters()) + + params_ait = {} + pt_params = {} + for key, arr in params_pt: + pt_params[key.replace("text_model.", "")] = arr + + pt_params = dict(pt_mod.named_parameters()) + for key, arr in pt_params.items(): + name = key.replace("text_model.", "") + ait_name = name.replace(".", "_") + if name.endswith("out_proj.weight"): + ait_name = ait_name.replace("out_proj", "proj") + elif name.endswith("out_proj.bias"): + ait_name = ait_name.replace("out_proj", "proj") + elif name.endswith("q_proj.weight"): + ait_name = ait_name.replace("q_proj", "qkv") + prefix = key[: -len("q_proj.weight")] + q = pt_params[prefix + "q_proj.weight"] + k = pt_params[prefix + "k_proj.weight"] + v = pt_params[prefix + "v_proj.weight"] + qkv_weight = torch.cat([q, k, v], dim=0) + params_ait[ait_name] = qkv_weight + continue + elif name.endswith("q_proj.bias"): + ait_name = ait_name.replace("q_proj", "qkv") + prefix = key[: -len("q_proj.bias")] + q = pt_params[prefix + "q_proj.bias"] + k = pt_params[prefix + "k_proj.bias"] + v = pt_params[prefix + "v_proj.bias"] + qkv_bias = torch.cat([q, k, v], dim=0) + params_ait[ait_name] = qkv_bias + continue + elif name.endswith("k_proj.weight"): + continue + elif name.endswith("k_proj.bias"): + continue + elif name.endswith("v_proj.weight"): + continue + elif name.endswith("v_proj.bias"): + continue + params_ait[ait_name] = arr + + if USE_CUDA: + for i in range(depth): + prefix = "encoder_layers_%d_self_attn_cu_length" % (i) + cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32") + params_ait[prefix] = torch.from_numpy(cu_len).cuda() + + return params_ait + + +def compile_unet( + batch_size=2, + hh=64, + ww=64, + dim=320, + use_fp16_acc=False, + convert_conv_to_gemm=False, +): + + ait_mod = ait_UNet2DConditionModel(sample_size=64, cross_attention_dim=768) 
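# The remaining steps mirror the other compile_* helpers in this file:
# 1. name_parameter_tensor() assigns every AIT weight a graph name derived from
#    its module path, which matches the underscore-mangled names produced by
#    map_unet_params().
# 2. Inputs are declared as AIT Tensors in channels-last layout:
#    latents [batch_size, hh, ww, 4], timesteps [2], text embeddings
#    [batch_size, 64, 768].
# 3. mark_output() tags the outputs and compile_model() bakes the mapped
#    PyTorch weights in as constants.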
+ ait_mod.name_parameter_tensor() + + # set AIT parameters + pt_mod = pipe.unet + pt_mod = pt_mod.eval() + params_ait = map_unet_params(pt_mod, dim) + + latent_model_input_ait = Tensor( + [batch_size, hh, ww, 4], name="input0", is_input=True + ) + timesteps_ait = Tensor([2], name="input1", is_input=True) + text_embeddings_pt_ait = Tensor([batch_size, 64, 768], name="input2", is_input=True) + + Y = ait_mod(latent_model_input_ait, timesteps_ait, text_embeddings_pt_ait) + mark_output(Y) + + target = detect_target( + use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm + ) + compile_model(Y, target, "./tmp", "UNet2DConditionModel", constants=params_ait) + + +def compile_clip( + batch_size=1, + seqlen=64, + dim=768, + num_heads=12, + hidden_size=768, + vocab_size=49408, + max_position_embeddings=77, + use_fp16_acc=False, + convert_conv_to_gemm=False, +): + mask_seq = 0 + causal = True + depth = 12 + + ait_mod = ait_CLIPTextTransformer( + num_hidden_layers=depth, + hidden_size=dim, + num_attention_heads=num_heads, + batch_size=batch_size, + seq_len=seqlen, + causal=causal, + mask_seq=mask_seq, + ) + ait_mod.name_parameter_tensor() + + pt_mod = pipe.text_encoder + pt_mod = pt_mod.eval() + params_ait = map_clip_params(pt_mod, batch_size, seqlen, depth) + + input_ids_ait = Tensor( + [batch_size, seqlen], name="input0", dtype="int64", is_input=True + ) + position_ids_ait = Tensor( + [batch_size, seqlen], name="input1", dtype="int64", is_input=True + ) + Y = ait_mod(input_ids=input_ids_ait, position_ids=position_ids_ait) + mark_output(Y) + + target = detect_target( + use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm + ) + compile_model(Y, target, "./tmp", "CLIPTextModel", constants=params_ait) + + +def compile_vae( + batch_size=1, height=64, width=64, use_fp16_acc=False, convert_conv_to_gemm=False +): + in_channels = 3 + out_channels = 3 + down_block_types = [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + ] + up_block_types = [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + ] + block_out_channels = [128, 256, 512, 512] + layers_per_block = 2 + act_fn = "silu" + latent_channels = 4 + sample_size = 512 + + ait_vae = ait_AutoencoderKL( + batch_size, + height, + width, + in_channels=in_channels, + out_channels=out_channels, + down_block_types=down_block_types, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + latent_channels=latent_channels, + sample_size=sample_size, + ) + ait_input = Tensor( + shape=[batch_size, height, width, latent_channels], + name="vae_input", + is_input=True, + ) + ait_vae.name_parameter_tensor() + + pt_mod = pipe.vae + pt_mod = pt_mod.eval() + params_ait = map_vae_params(ait_vae, pt_mod, batch_size, height * width) + + Y = ait_vae.decode(ait_input) + mark_output(Y) + target = detect_target( + use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm + ) + compile_model( + Y, + target, + "./tmp", + "AutoencoderKL", + constants=params_ait, + ) + + +@click.command() +@click.option("--token", default="", help="access token") +@click.option("--use-fp16-acc", default=True, help="use fp16 accumulation") +@click.option("--convert-conv-to-gemm", default=True, help="convert 1x1 conv to gemm") +def compile_diffusers(token, use_fp16_acc=True, convert_conv_to_gemm=True): + logging.getLogger().setLevel(logging.INFO) + np.random.seed(0) + torch.manual_seed(4896) + + if 
detect_target().name() == "rocm": + convert_conv_to_gemm = False + + global access_token, pipe + if token != "": + access_token = token + + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=access_token, + ).to("cuda") + + # CLIP + compile_clip(use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm) + # UNet + compile_unet( + batch_size=2, + use_fp16_acc=use_fp16_acc, + convert_conv_to_gemm=convert_conv_to_gemm, + ) + # VAE + compile_vae(use_fp16_acc=use_fp16_acc, convert_conv_to_gemm=convert_conv_to_gemm) + + +if __name__ == "__main__": + compile_diffusers() diff --git a/examples/05_stable_diffusion/demo.py b/examples/05_stable_diffusion/demo.py new file mode 100644 index 000000000..5a7b8b79e --- /dev/null +++ b/examples/05_stable_diffusion/demo.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import click +import torch + +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from pipeline_stable_diffusion_ait import StableDiffusionAITPipeline + + +@click.command() +@click.option("--token", default="", help="access token") +@click.option("--prompt", default="A vision of paradise, Unreal Engine", help="prompt") +@click.option( + "--benchmark", type=bool, default=False, help="run stable diffusion e2e benchmark" +) +def run(token, prompt, benchmark): + pipe = StableDiffusionAITPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=token, + ).to("cuda") + + with torch.autocast("cuda"): + image = pipe(prompt).images[0] + if benchmark: + t = benchmark_torch_function(10, pipe, prompt) + print(f"sd e2e: {t} ms") + + image.save("example_ait.png") + + +if __name__ == "__main__": + run() diff --git a/examples/05_stable_diffusion/modeling/attention.py b/examples/05_stable_diffusion/modeling/attention.py new file mode 100644 index 000000000..efabc3c0c --- /dev/null +++ b/examples/05_stable_diffusion/modeling/attention.py @@ -0,0 +1,104 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Implementations are translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py. 
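Compared with that reference implementation, this port keeps activations in
channels-last (NHWC) layout and maps the separate query/key/value linears of
the original AttentionBlock onto AIT's fused nn.MultiheadAttention module
(the corresponding weight fusion is done in compile.py's map_vae_params).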
+""" + +from typing import Optional + +from aitemplate.compiler.ops import reshape + +from aitemplate.frontend import nn, Tensor + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted + to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + Uses three q, k, v linear layers to compute attention. + Parameters: + batch_size (:obj:`int`): The number of examples per batch. + height (:obj:`int`): Height of each image example. + width (:obj:`int`): Width of each image example. + channels (:obj:`int`): The number of channels in the input and output. + num_head_channels (:obj:`int`, *optional*): + The number of channels in each head. If None, then `num_heads` = 1. + num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm. + eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm. + """ + + def __init__( + self, + batch_size: int, + height: int, + width: int, + channels: int, + num_head_channels: Optional[int] = None, + num_groups: int = 32, + rescale_output_factor: float = 1.0, + eps: float = 1e-5, + ): + super().__init__() + self.batch_size = batch_size + self.height = height + self.width = width + self.channels = channels + self.num_heads = ( + channels // num_head_channels if num_head_channels is not None else 1 + ) + self.num_head_size = num_head_channels + self.group_norm = nn.GroupNorm(num_groups, channels, eps) + self.attention = nn.MultiheadAttention( + channels, + batch_size, + height * width, + self.num_heads, + qkv_bias=True, + has_residual=True, + ) + self.rescale_output_factor = rescale_output_factor + + def forward(self, hidden_states) -> Tensor: + """ + input hidden_states shape: [batch, height, width, channel] + output shape: [batch, height, width, channel] + """ + residual = hidden_states + + # norm + hidden_states = self.group_norm(hidden_states) + + hidden_states = reshape()( + hidden_states, [self.batch_size, self.height * self.width, self.channels] + ) + + batch, hw, channel = hidden_states.shape() + if ( + batch.value() != self.batch_size + or hw.value() != self.width * self.height + or channel.value() != self.channels + ): + raise RuntimeError( + "nchw params do not match! " + f"Expected: {self.batch_size}, {self.channels}, {self.height} * {self.width}, " + f"actual: {batch}, {channel}, {hw}." + ) + + res = self.attention(hidden_states, residual) * (1 / self.rescale_output_factor) + res = reshape()(res, [self.batch_size, self.height, self.width, self.channels]) + + return res diff --git a/examples/05_stable_diffusion/modeling/clip.py b/examples/05_stable_diffusion/modeling/clip.py new file mode 100644 index 000000000..c66ecfb90 --- /dev/null +++ b/examples/05_stable_diffusion/modeling/clip.py @@ -0,0 +1,590 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from inspect import isfunction +from typing import Optional + +from aitemplate.compiler import ops +from aitemplate.frontend import nn, Tensor +from aitemplate.testing import detect_target + +# pylint: disable=W0102 + +USE_CUDA = detect_target().name() == "cuda" + + +def get_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +class CrossAttention(nn.Module): + def __init__( + self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.0, + dtype="float16", + ): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + self.dim_head = dim_head + + self.to_q_weight = nn.Parameter(shape=[inner_dim, query_dim], dtype=dtype) + self.to_k_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype) + self.to_v_weight = nn.Parameter(shape=[inner_dim, context_dim], dtype=dtype) + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout) + ) + + def forward(self, x, context=None, mask=None, residual=None): + nheads = self.heads + d = self.dim_head + + layout = "20314" if USE_CUDA else "m2n3" + + bs, seqlen, _ = get_shape(x) + q = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)( + ops.reshape()(x, [bs * seqlen, -1]), self.to_q_weight.tensor() + ) + context = default(context, x) + + seqlen = get_shape(context)[1] + k = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)( + ops.reshape()(context, [bs * seqlen, -1]), self.to_k_weight.tensor() + ) + v = ops.gemm_rcr_permute(shape=(seqlen, 1, nheads), layout=layout)( + ops.reshape()(context, [bs * seqlen, -1]), self.to_v_weight.tensor() + ) + + if USE_CUDA: + q = q * self.scale + attn = ops.bmm_rcr()( + (ops.reshape()(q, [bs * nheads, -1, d])), + (ops.reshape()(k, [bs * nheads, -1, d])), + ) + attn = ops.softmax()(attn, -1) + v = ops.reshape()(v, [bs * nheads, -1, d]) + out = ops.bmm_rrr_permute((nheads,))(attn, v) + else: + OP = ops.bmm_softmax_bmm_permute(shape=(nheads,), scale=self.scale) + out = OP( + (ops.reshape()(q, [bs * nheads, -1, d])), + (ops.reshape()(k, [bs * nheads, -1, d])), + (ops.reshape()(v, [bs * nheads, -1, d])), + ) + out = ops.reshape()(out, [bs, -1, nheads * d]) + proj = self.to_out(out) + proj = ops.reshape()(proj, [bs, -1, nheads * d]) + if residual is not None: + return proj + residual + else: + return proj + + +class GEGLU(nn.Module): + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, specialization="mul") + self.gate = nn.Linear(dim_in, dim_out, specialization="fast_gelu") + + def forward(self, x): + return self.proj(x, self.gate(x)) + + +class FeedForward(nn.Module): + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = ( + nn.Sequential( + nn.Linear(dim, inner_dim, specialization="fast_gelu"), + ) + if not glu + else GEGLU(dim, inner_dim) + ) + + self.net = nn.Sequential( + project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) + ) + + def forward(self, x, residual=None): + shape = ops.size()(x) + x = self.net(x) + x = ops.reshape()(x, shape) + if residual is not None: + return x + residual + else: + return x + + +class BasicTransformerBlock(nn.Module): + def __init__( + self, + dim, + n_heads, + d_head, + 
dropout=0.0, + context_dim=None, + gated_ff=True, + checkpoint=True, + ): + super().__init__() + self.attn1 = CrossAttention( + query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout + ) # is a self-attention + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = CrossAttention( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + ) + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + self.param = (dim, n_heads, d_head, context_dim, gated_ff, checkpoint) + + def forward(self, x, context=None): + x = self.attn1(self.norm1(x), residual=x) + x = self.attn2(self.norm2(x), context=context, residual=x) + x = self.ff(self.norm3(x), residual=x) + return x + + +def Normalize(in_channels): + return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + +class SpatialTransformer(nn.Module): + """ + Transformer block for image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + """ + + def __init__( + self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None + ): + super().__init__() + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) # Group Norm + + self.proj_in = nn.Conv2dBias( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0 + ) + + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim + ) + for d in range(depth) + ] + ) + + self.proj_out = nn.Conv2dBias( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + b, h, w, c = get_shape(x) + x_in = x + x = self.norm(x) + x = self.proj_in(x) + x = ops.reshape()(x, [b, -1, c]) + for block in self.transformer_blocks: + x = block(x, context=context) + x = ops.reshape()(x, [b, h, w, c]) + x = self.proj_out(x) + return x + x_in + + +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + hidden_size=768, + num_attention_heads=12, + attention_dropout=0.0, + batch_size=1, + seq_len=16, + layer_norm_eps=1e-5, + hidden_dropout_prob=0.0, + causal=False, + mask_seq=0, + ): + super().__init__() + self.attn = nn.MultiheadAttention( + dim=hidden_size, + batch_size=batch_size, + seq_len=seq_len, + num_heads=num_attention_heads, + qkv_bias=True, + attn_drop=attention_dropout, + proj_drop=hidden_dropout_prob, + has_residual=False, + causal=causal, + mask_seq=mask_seq, + ) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + causal_attention_mask: Optional[Tensor] = None, + output_attentions: Optional[bool] = False, + residual: Optional[Tensor] = None, + ): + if residual is not None: + self_output = self.attn(hidden_states, residual) + else: + self_output = self.attn(hidden_states) + return self_output + + +class QuickGELUActivation(nn.Module): + """ + Applies GELU approximation that is fast but somewhat inaccurate. 
See: https://github.com/hendrycks/GELUs + """ + + def forward(self, x): + x1 = x * 1.702 + x1 = ops.sigmoid(x1) + x = x * x1 + return x + + +class CLIPMLP(nn.Module): + """MLP as used in Vision Transformer, MLP-Mixer and related networks""" + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer="GELU", + drop=0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.fc1 = nn.Linear( + in_features, + hidden_features, + ) + self.activation_fn = QuickGELUActivation() + self.fc2 = nn.Linear(hidden_features, out_features, specialization="add") + + def forward(self, x, res): + shape = get_shape(x) + x = self.fc1(x) + x = self.activation_fn(x) + x = self.fc2(x, res) + return ops.reshape()(x, shape) + + +class CLIPEncoderLayer(nn.Module): + def __init__( + self, + hidden_size=768, + num_attention_heads=12, + attention_dropout=0.0, + mlp_ratio=4.0, + batch_size=1, + seq_len=16, + causal=False, + mask_seq=0, + ): + super().__init__() + self.embed_dim = hidden_size + self.self_attn = nn.MultiheadAttention( + dim=hidden_size, + batch_size=batch_size, + seq_len=seq_len, + num_heads=num_attention_heads, + qkv_bias=True, + attn_drop=attention_dropout, + proj_drop=0, + has_residual=True, + causal=causal, + mask_seq=mask_seq, + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.mlp = CLIPMLP(hidden_size, int(hidden_size * mlp_ratio)) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: Tensor, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn(hidden_states, residual) + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states, residual) + + return hidden_states + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPEncoderLayer`]. 
+ Args: + config: CLIPConfig + """ + + def __init__( + self, + num_hidden_layers=12, + output_attentions=False, + output_hidden_states=False, + use_return_dict=False, + hidden_size=768, + num_attention_heads=12, + batch_size=1, + seq_len=64, + causal=False, + mask_seq=0, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + CLIPEncoderLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + batch_size=batch_size, + seq_len=seq_len, + causal=causal, + mask_seq=mask_seq, + ) + for _ in range(num_hidden_layers) + ] + ) + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.use_return_dict = use_return_dict + + def forward( + self, + inputs_embeds, + attention_mask: Optional[Tensor] = None, + causal_attention_mask: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
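Note: in this AIT port the mask and flag arguments above are accepted only for
signature compatibility; the encoder simply runs each CLIPEncoderLayer in
sequence and returns the final hidden states, and causal masking is configured
at construction time via the `causal` flag passed down to each layer's
attention module.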
+ """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.use_return_dict + + encoder_states = () if output_hidden_states else None + # all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for _, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer(hidden_states) + hidden_states = layer_outputs + + return hidden_states + + +class CLIPTextEmbeddings(nn.Module): + def __init__( + self, + hidden_size=768, + vocab_size=49408, + max_position_embeddings=77, + dtype="float16", + ): + super().__init__() + embed_dim = hidden_size + + self.token_embedding = nn.Embedding(shape=[vocab_size, embed_dim], dtype=dtype) + self.position_embedding = nn.Embedding( + shape=[max_position_embeddings, embed_dim], dtype=dtype + ) + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + inputs_embeds: Optional[Tensor] = None, + ) -> Tensor: + + input_shape = ops.size()(input_ids) + + # [B * S] + input_ids = ops.reshape()(input_ids, [-1]) + + position_ids = ops.reshape()(position_ids, [-1]) + + if inputs_embeds is None: + inputs_embeds = ops.batch_gather()(self.token_embedding.tensor(), input_ids) + + position_embeddings = ops.batch_gather()( + self.position_embedding.tensor(), position_ids + ) + + embeddings = inputs_embeds + position_embeddings + + embeddings = ops.reshape()(embeddings, [input_shape[0], input_shape[1], -1]) + + return embeddings + + +class CLIPTextTransformer(nn.Module): + def __init__( + self, + hidden_size=768, + output_attentions=False, + output_hidden_states=False, + use_return_dict=False, + num_hidden_layers=12, + num_attention_heads=12, + batch_size=1, + seq_len=64, + causal=False, + mask_seq=0, + ): + super().__init__() + embed_dim = hidden_size + self.embeddings = CLIPTextEmbeddings() + self.encoder = CLIPEncoder( + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + batch_size=batch_size, + seq_len=seq_len, + causal=causal, + mask_seq=mask_seq, + ) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.use_return_dict = use_return_dict + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + ) + + last_hidden_state = encoder_outputs + last_hidden_state = self.final_layer_norm(last_hidden_state) + return last_hidden_state diff --git 
a/examples/05_stable_diffusion/modeling/embeddings.py b/examples/05_stable_diffusion/modeling/embeddings.py new file mode 100644 index 000000000..36b96a4fb --- /dev/null +++ b/examples/05_stable_diffusion/modeling/embeddings.py @@ -0,0 +1,101 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import math + +from aitemplate.compiler import ops +from aitemplate.frontend import nn, Tensor + + +def get_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +def get_timestep_embedding( + timesteps: Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the + embeddings. :return: an [N x dim] Tensor of positional embeddings. + """ + assert len(get_shape(timesteps)) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + + exponent = (-math.log(max_period)) * Tensor( + shape=[half_dim], dtype="float16", name="arange" + ) + + exponent = exponent * (1.0 / (half_dim - downscale_freq_shift)) + + emb = ops.exp(exponent) + emb = ops.reshape()(timesteps, [-1, 1]) * ops.reshape()(emb, [1, -1]) + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + if flip_sin_to_cos: + emb = ops.concatenate()( + [ops.cos(emb), ops.sin(emb)], + dim=-1, + ) + else: + emb = ops.concatenate()( + [ops.sin(emb), ops.cos(emb)], + dim=-1, + ) + return emb + + +class TimestepEmbedding(nn.Module): + def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"): + super().__init__() + + self.linear_1 = nn.Linear(channel, time_embed_dim, specialization="swish") + self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim) + + def forward(self, sample): + sample = self.linear_1(sample) + sample = self.linear_2(sample) + return sample + + +class Timesteps(nn.Module): + def __init__( + self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float + ): + super().__init__() + self.num_channels = num_channels + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + + def forward(self, timesteps): + t_emb = get_timestep_embedding( + timesteps, + self.num_channels, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + ) + return t_emb diff --git a/examples/05_stable_diffusion/modeling/resnet.py b/examples/05_stable_diffusion/modeling/resnet.py new file mode 100644 index 000000000..03e4f8023 --- /dev/null +++ b/examples/05_stable_diffusion/modeling/resnet.py @@ -0,0 +1,238 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
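The AIT graph built by get_timestep_embedding above reads the 0..half_dim-1 range from a constant named "arange" (supplied by compile.py's map_unet_params), scales it by -log(max_period)/(half_dim - downscale_freq_shift), exponentiates, multiplies by the timesteps, and concatenates sin/cos. A plain-PyTorch sketch of the same computation, useful for sanity-checking values (argument defaults mirror the AIT function):

import math
import torch

def timestep_embedding_ref(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1.0,
    scale: float = 1.0,
    max_period: int = 10000,
) -> torch.Tensor:
    # Same recipe as get_timestep_embedding, but in eager PyTorch.
    half_dim = embedding_dim // 2
    exponent = -math.log(max_period) * torch.arange(half_dim, dtype=torch.float32)
    exponent = exponent / (half_dim - downscale_freq_shift)
    emb = torch.exp(exponent)
    emb = timesteps.float()[:, None] * emb[None, :]
    emb = scale * emb
    if flip_sin_to_cos:
        return torch.cat([emb.cos(), emb.sin()], dim=-1)
    return torch.cat([emb.sin(), emb.cos()], dim=-1)

# e.g. timestep_embedding_ref(torch.tensor([0, 999]), 320, flip_sin_to_cos=True)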
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from aitemplate.compiler import ops +from aitemplate.frontend import nn + + +def get_shape(x): + shape = [it.value() for it in x._attrs["shape"]] + return shape + + +class Upsample2D(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__( + self, + channels, + use_conv=False, + use_conv_transpose=False, + out_channels=None, + name="conv", + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + + conv = None + if use_conv_transpose: + conv = nn.ConvTranspose2dBias(channels, self.out_channels, 4, 2, 1) + elif use_conv: + conv = nn.Conv2dBias(self.channels, self.out_channels, 3, 1, 1) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if name == "conv": + self.conv = conv + else: + self.Conv2d_0 = conv + + def forward(self, x): + assert get_shape(x)[-1] == self.channels + if self.use_conv_transpose: + return self.conv(x) + + x = nn.Upsampling2d(scale_factor=2.0, mode="nearest")(x) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if self.use_conv: + if self.name == "conv": + x = self.conv(x) + else: + x = self.Conv2d_0(x) + + return x + + +class Downsample2D(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. 
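Note: tensors are channels-last (NHWC) in this port, so `channels` is checked
against the last dimension of the input, and the convolution / average-pool
submodules are AIT's channels-last ops (nn.Conv2dBias, nn.AvgPool2d).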
+ """ + + def __init__( + self, channels, use_conv=False, out_channels=None, padding=1, name="conv" + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.padding = padding + stride = 2 + self.name = name + + if use_conv: + conv = nn.Conv2dBias( + self.channels, self.out_channels, 3, stride=stride, padding=padding + ) + else: + assert self.channels == self.out_channels + conv = nn.AvgPool2d(kernel_size=stride, stride=stride, padding=0) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if name == "conv": + self.Conv2d_0 = conv + self.conv = conv + elif name == "Conv2d_0": + self.conv = conv + else: + self.conv = conv + + def forward(self, x): + assert get_shape(x)[-1] == self.channels + x = self.conv(x) + + return x + + +class ResnetBlock2D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + non_linearity="swish", + time_embedding_norm="default", + kernel=None, + output_scale_factor=1.0, + use_nin_shortcut=None, + up=False, + down=False, + ): + super().__init__() + self.pre_norm = pre_norm + self.pre_norm = True + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.time_embedding_norm = time_embedding_norm + self.up = up + self.down = down + self.output_scale_factor = output_scale_factor + + if groups_out is None: + groups_out = groups + + self.norm1 = nn.GroupNorm( + num_groups=groups, + num_channels=in_channels, + eps=eps, + affine=True, + use_swish=True, + ) + + self.conv1 = nn.Conv2dBias( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + + if temb_channels is not None: + self.time_emb_proj = nn.Linear(temb_channels, out_channels) + else: + self.time_emb_proj = None + + self.norm2 = nn.GroupNorm( + num_groups=groups_out, + num_channels=out_channels, + eps=eps, + affine=True, + use_swish=True, + ) + self.dropout = nn.Dropout(dropout) + self.conv2 = nn.Conv2dBias( + out_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + + self.upsample = self.downsample = None + + self.use_nin_shortcut = ( + self.in_channels != self.out_channels + if use_nin_shortcut is None + else use_nin_shortcut + ) + + if self.use_nin_shortcut: + self.conv_shortcut = nn.Conv2dBias( + in_channels, out_channels, 1, 1, 0 + ) # kernel_size=1, stride=1, padding=0) # conv_bias_add + else: + self.conv_shortcut = None + + def forward(self, x, temb=None): + hidden_states = x + + # make sure hidden states is in float32 + # when running in half-precision + hidden_states = self.norm1( + hidden_states + ) # .float()).type(hidden_states.dtype) # fused swish + # hidden_states = self.nonlinearity(hidden_states) + + if self.upsample is not None: + x = self.upsample(x) + hidden_states = self.upsample(hidden_states) + elif self.downsample is not None: + x = self.downsample(x) + hidden_states = self.downsample(hidden_states) + + hidden_states = self.conv1(hidden_states) + + if temb is not None: + temb = self.time_emb_proj(ops.silu(temb)) + bs, dim = get_shape(temb) + temb = ops.reshape()(temb, [bs, 1, 1, dim]) + hidden_states = hidden_states + temb + + # make sure hidden states is in float32 + # when running in half-precision + hidden_states = self.norm2(hidden_states) + + hidden_states = self.dropout(hidden_states) + 
hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + x = self.conv_shortcut(x) + + out = hidden_states + x + + return out diff --git a/examples/05_stable_diffusion/modeling/unet_2d_condition.py b/examples/05_stable_diffusion/modeling/unet_2d_condition.py new file mode 100644 index 000000000..9c1d9f07c --- /dev/null +++ b/examples/05_stable_diffusion/modeling/unet_2d_condition.py @@ -0,0 +1,251 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from typing import Optional, Tuple + +from aitemplate.frontend import nn + +from modeling.embeddings import TimestepEmbedding, Timesteps +from modeling.unet_blocks import get_down_block, get_up_block, UNetMidBlock2DCrossAttn + + +class UNet2DConditionModel(nn.Module): + r""" + UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep + and returns sample shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + sample_size (`int`, *optional*): The size of the input sample. + in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`): + The tuple of upsample blocks to use. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. 
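Note: unlike the diffusers reference, this AIT port takes `sample` in
channels-last layout ([batch, height, width, channels]) and `forward`
returns the output sample tensor directly; `return_dict` is accepted for
signature compatibility only.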
+ """ + + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + up_block_types: Tuple[str] = ( + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ), + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: int = 8, + ): + super().__init__() + self.center_input_sample = center_input_sample + self.sample_size = sample_size + time_embed_dim = block_out_channels[0] * 4 + + # input + self.conv_in = nn.Conv2dBias(in_channels, block_out_channels[0], 3, 1, 1) + # time + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim, + downsample_padding=downsample_padding, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift="default", + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[ + min(i + 1, len(block_out_channels) - 1) + ] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps, + use_swish=True, + ) + + self.conv_out = nn.Conv2dBias(block_out_channels[0], out_channels, 3, 1, 1) + + def forward( + self, + sample, + timesteps, + encoder_hidden_states, + return_dict: bool = 
True, + ): + """r + Args: + sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor + timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps + encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + + # 1. time + t_emb = self.time_proj(timesteps) + emb = self.time_embedding(t_emb) + + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "attentions") + and downsample_block.attentions is not None + ): + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + sample = self.mid_block( + sample, emb, encoder_hidden_states=encoder_hidden_states + ) + + # 5. up + for upsample_block in self.up_blocks: + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[ + : -len(upsample_block.resnets) + ] + + if ( + hasattr(upsample_block, "attentions") + and upsample_block.attentions is not None + ): + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + ) + else: + sample = upsample_block( + hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples + ) + + # 6. post-process + # make sure hidden states is in float32 + # when running in half-precision + sample = self.conv_norm_out(sample) + sample = self.conv_out(sample) + return sample diff --git a/examples/05_stable_diffusion/modeling/unet_blocks.py b/examples/05_stable_diffusion/modeling/unet_blocks.py new file mode 100644 index 000000000..75de2e0c8 --- /dev/null +++ b/examples/05_stable_diffusion/modeling/unet_blocks.py @@ -0,0 +1,761 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
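A side note on the skip-connection bookkeeping in UNet2DConditionModel.forward above: each down block appends its intermediate states to down_block_res_samples, and each up block then slices off as many entries as it has resnets, consuming them in reverse. A toy, pure-Python illustration of that slicing (counts and names are made up):

# Down blocks push residuals left to right; up blocks consume them right to left.
down_block_res_samples = ["d0", "d1", "d2", "d3", "d4", "d5"]
resnets_per_up_block = [3, 3]  # illustrative counts

for num_resnets in resnets_per_up_block:
    res_samples = down_block_res_samples[-num_resnets:]
    down_block_res_samples = down_block_res_samples[:-num_resnets]
    print(res_samples)
# prints ['d3', 'd4', 'd5'] and then ['d0', 'd1', 'd2']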
+# See the License for the specific language governing permissions and + +# flake8: noqa +from aitemplate.compiler import ops + +from aitemplate.frontend import nn, Tensor +from aitemplate.testing import detect_target +from modeling.attention import AttentionBlock + +from modeling.clip import SpatialTransformer +from modeling.resnet import Downsample2D, ResnetBlock2D, Upsample2D + +# pylint: disable=W0102 + + +def get_down_block( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + cross_attention_dim=None, + downsample_padding=None, +): + down_block_type = ( + down_block_type[7:] + if down_block_type.startswith("UNetRes") + else down_block_type + ) + if down_block_type == "DownBlock2D": + return DownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + ) + elif down_block_type == "AttnDownBlock2D": + return AttnDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + attn_num_head_channels=attn_num_head_channels, + ) + elif down_block_type == "CrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnDownBlock2D" + ) + return CrossAttnDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attn_num_head_channels, + ) + elif down_block_type == "SkipDownBlock2D": + return SkipDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + ) + elif down_block_type == "AttnSkipDownBlock2D": + return AttnSkipDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + attn_num_head_channels=attn_num_head_channels, + ) + elif down_block_type == "DownEncoderBlock2D": + return DownEncoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + ) + + +def get_up_block( + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + cross_attention_dim=None, +): + up_block_type = ( + up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + ) + if up_block_type == "UpBlock2D": + return UpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + 
resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + ) + elif up_block_type == "CrossAttnUpBlock2D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnUpBlock2D" + ) + return CrossAttnUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attn_num_head_channels, + ) + elif up_block_type == "AttnUpBlock2D": + return AttnUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + attn_num_head_channels=attn_num_head_channels, + ) + elif up_block_type == "SkipUpBlock2D": + return SkipUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + ) + elif up_block_type == "AttnSkipUpBlock2D": + return AttnSkipUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + attn_num_head_channels=attn_num_head_channels, + ) + elif up_block_type == "UpDecoderBlock2D": + return UpDecoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + ) + raise ValueError(f"{up_block_type} does not exist.") + + +class UNetMidBlock2DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + attention_type="default", + output_scale_factor=1.0, + cross_attention_dim=1280, + **kwargs, + ): + super().__init__() + + self.attention_type = attention_type + self.attn_num_head_channels = attn_num_head_channels + resnet_groups = ( + resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + ) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for _ in range(num_layers): + attentions.append( + SpatialTransformer( + in_channels, + attn_num_head_channels, + in_channels // attn_num_head_channels, + depth=1, + context_dim=cross_attention_dim, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions 
= nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states, temb=None, encoder_hidden_states=None): + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + hidden_states = attn(hidden_states, encoder_hidden_states) + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class CrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + attention_type="default", + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + ): + super().__init__() + + resnets = [] + attentions = [] + + self.attention_type = attention_type + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + attentions.append( + SpatialTransformer( + out_channels, + attn_num_head_channels, + out_channels // attn_num_head_channels, + depth=1, + context_dim=cross_attention_dim, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + in_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + def forward(self, hidden_states, temb=None, encoder_hidden_states=None): + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states, context=encoder_hidden_states) + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class DownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + in_channels, + use_conv=True, + out_channels=out_channels, + 
padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + def forward(self, hidden_states, temb=None): + output_states = () + + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb) + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + attention_type="default", + output_scale_factor=1.0, + downsample_padding=1, + add_upsample=True, + ): + super().__init__() + + resnets = [] + attentions = [] + + self.attention_type = attention_type + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + attentions.append( + SpatialTransformer( + out_channels, + attn_num_head_channels, + out_channels // attn_num_head_channels, + depth=1, + context_dim=cross_attention_dim, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + def forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + ): + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = ops.concatenate()( + [hidden_states, res_hidden_states], dim=-1 + ) + + hidden_states = resnet(hidden_states, temb=temb) + hidden_states = attn(hidden_states, context=encoder_hidden_states) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class UpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + 
temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + def forward(self, hidden_states, res_hidden_states_tuple, temb=None): + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = ops.concatenate()( + [hidden_states, res_hidden_states], dim=-1 + ) + + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class UpDecoderBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + input_channels = in_channels if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=None, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + def forward(self, hidden_states): + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb=None) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class UNetMidBlock2D(nn.Module): + def __init__( + self, + batch_size, + height, + width, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + attention_type="default", + output_scale_factor=1.0, + **kwargs, + ): + super().__init__() + + if attention_type != "default": + raise NotImplementedError( + f"attention_type must be default! 
current value: {attention_type}" + ) + + resnet_groups = ( + resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + ) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for _ in range(num_layers): + attentions.append( + AttentionBlock( + batch_size, + height, + width, + in_channels, + num_head_channels=attn_num_head_channels, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states, temb=None, encoder_states=None): + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + hidden_states = attn(hidden_states) + hidden_states = resnet(hidden_states, temb) + + return hidden_states diff --git a/examples/05_stable_diffusion/modeling/vae.py b/examples/05_stable_diffusion/modeling/vae.py new file mode 100644 index 000000000..6a239f233 --- /dev/null +++ b/examples/05_stable_diffusion/modeling/vae.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Translated from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/vae.py. 
+""" + +from typing import Tuple + +from aitemplate.frontend import nn, Tensor +from modeling.unet_blocks import get_up_block, UNetMidBlock2D + + +class Decoder(nn.Module): + def __init__( + self, + batch_size, + height, + width, + in_channels=3, + out_channels=3, + up_block_types=("UpDecoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + act_fn="silu", + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2dBias( + in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1 + ) + + # mid + self.mid_block = UNetMidBlock2D( + batch_size, + height, + width, + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attn_num_head_channels=None, + resnet_groups=32, + temb_channels=None, + ) + + # up + self.up_blocks = nn.ModuleList([]) + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + attn_num_head_channels=None, + temb_channels=None, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + num_groups_out = 32 + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=num_groups_out, + eps=1e-6, + use_swish=True, + ) + self.conv_out = nn.Conv2dBias( + block_out_channels[0], out_channels, kernel_size=3, padding=1, stride=1 + ) + + def forward(self, z) -> Tensor: + sample = z + sample = self.conv_in(sample) + + # middle + sample = self.mid_block(sample) + + # up + for up_block in self.up_blocks: + sample = up_block(sample) + + sample = self.conv_norm_out(sample) + sample = self.conv_out(sample) + + return sample + + +class AutoencoderKL(nn.Module): + def __init__( + self, + batch_size: int, + height: int, + width: int, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + sample_size: int = 32, + ): + super().__init__() + self.decoder = Decoder( + batch_size, + height, + width, + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + ) + self.post_quant_conv = nn.Conv2dBias( + latent_channels, latent_channels, kernel_size=1, stride=1, padding=0 + ) + + def decode(self, z: Tensor, return_dict: bool = True): + + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self): + raise NotImplementedError("Only decode() is implemented for AutoencoderKL!") diff --git a/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py new file mode 100644 index 000000000..211fc99d9 --- /dev/null +++ b/examples/05_stable_diffusion/pipeline_stable_diffusion_ait.py @@ -0,0 +1,371 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import inspect + +import os +import warnings +from typing import List, Optional, Union + +import torch +from aitemplate.compiler import Model + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) +from diffusers.pipelines.stable_diffusion import ( + StableDiffusionPipelineOutput, + StableDiffusionSafetyChecker, +) + +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + + +class StableDiffusionAITPipeline(StableDiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+ """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + workdir = "tmp/" + self.clip_ait_exe = self.init_ait_module( + model_name="CLIPTextModel", workdir=workdir + ) + self.unet_ait_exe = self.init_ait_module( + model_name="UNet2DConditionModel", workdir=workdir + ) + self.vae_ait_exe = self.init_ait_module( + model_name="AutoencoderKL", workdir=workdir + ) + + def init_ait_module( + self, + model_name, + workdir, + ): + mod = Model(os.path.join(workdir, model_name, "test.so")) + return mod + + def unet_inference(self, latent_model_input, timesteps, encoder_hidden_states): + exe_module = self.unet_ait_exe + timesteps_pt = timesteps.expand(latent_model_input.shape[0]) + inputs = { + "input0": latent_model_input.permute((0, 2, 3, 1)) + .contiguous() + .cuda() + .half(), + "input1": timesteps_pt.cuda().half(), + "input2": encoder_hidden_states.cuda().half(), + } + ys = [] + num_ouputs = len(exe_module.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = exe_module.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + exe_module.run_with_tensors(inputs, ys, graph_mode=True) + noise_pred = ys[0].permute((0, 3, 1, 2)).float() + return noise_pred + + def clip_inference(self, input_ids, seqlen=64): + exe_module = self.clip_ait_exe + position_ids = torch.arange(seqlen).expand((1, -1)).cuda() + inputs = { + "input0": input_ids, + "input1": position_ids, + } + ys = [] + num_ouputs = len(exe_module.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = exe_module.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + exe_module.run_with_tensors(inputs, ys, graph_mode=True) + return ys[0].float() + + def vae_inference(self, vae_input): + exe_module = self.vae_ait_exe + inputs = [torch.permute(vae_input, (0, 2, 3, 1)).contiguous().cuda().half()] + ys = [] + num_ouputs = len(exe_module.get_output_name_to_index_map()) + for i in range(num_ouputs): + shape = exe_module.get_output_maximum_shape(i) + ys.append(torch.empty(shape).cuda().half()) + exe_module.run_with_tensors(inputs, ys, graph_mode=True) + vae_out = ys[0].permute((0, 3, 1, 2)).float() + return vae_out + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation + deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + if "torch_device" in kwargs: + device = kwargs.pop("torch_device") + warnings.warn( + "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0." + " Consider using `pipe.to(torch_device)` instead." + ) + + # Set device as before (to be removed in 0.3.0) + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(device) + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." + ) + + # get prompt text embeddings + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=64, # self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_embeddings = self.clip_inference(text_input.input_ids.to(self.device)) + + # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance.
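+ # With guidance enabled, the prompt is encoded twice (unconditional and text-conditioned), + # and the two noise predictions are later combined as + # noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond).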
+ do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + max_length = text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + uncond_embeddings = self.clip_inference( + uncond_input.input_ids.to(self.device) + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + # get the initial random noise unless the user supplied it + + # Unlike in other pipelines, latents need to be generated in the target device + # for 1-to-1 results reproducibility with the CompVis implementation. + # However this currently doesn't work in `mps`. + latents_device = "cpu" if self.device.type == "mps" else self.device + latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8) + if latents is None: + latents = torch.randn( + latents_shape, + generator=generator, + device=latents_device, + ) + else: + if latents.shape != latents_shape: + raise ValueError( + f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" + ) + latents = latents.to(self.device) + + # set timesteps + accepts_offset = "offset" in set( + inspect.signature(self.scheduler.set_timesteps).parameters.keys() + ) + extra_set_kwargs = {} + if accepts_offset: + extra_set_kwargs["offset"] = 1 + + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas + if isinstance(self.scheduler, LMSDiscreteScheduler): + latents = latents * self.scheduler.sigmas[0] + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * 2) if do_classifier_free_guidance else latents + ) + if isinstance(self.scheduler, LMSDiscreteScheduler): + sigma = self.scheduler.sigmas[i] + # the model input needs to be scaled to match the continuous ODE formulation in K-LMS + latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) + + # predict the noise residual + noise_pred = self.unet_inference( + latent_model_input, t, encoder_hidden_states=text_embeddings + ) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + if isinstance(self.scheduler, LMSDiscreteScheduler): + latents = self.scheduler.step( + noise_pred, i, latents, **extra_step_kwargs + ).prev_sample + else: + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs + ).prev_sample + + # scale and decode the image latents with vae + latents = 1 / 0.18215 * latents + image = self.vae_inference(latents) + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + # run safety checker + safety_cheker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="pt" + ).to(self.device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_cheker_input.pixel_values + ) + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput( + images=image, nsfw_content_detected=has_nsfw_concept + ) diff --git a/examples/06_how_to_add_an_op/how_to_add_an_op.py b/examples/06_how_to_add_an_op/how_to_add_an_op.py new file mode 100644 index 000000000..cd1646aeb --- /dev/null +++ b/examples/06_how_to_add_an_op/how_to_add_an_op.py @@ -0,0 +1,249 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
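+# +# Example overview: this file registers a custom "add_one" operator. The +# Operator subclass below records its input tensor and infers the output shape; +# the jinja2 templates render the CUDA/HIP kernel, function signature, +# declaration, and call site; and the registry entries +# ("cuda.add_one.gen_function", "cuda.add_one.func_decl", "cuda.add_one.func_call", +# plus their "rocm.*" counterparts) let the backend code generator pick them up. +# verify_add_one() builds Y = add_one()(X), compiles it with compile_model, and +# checks the result against x + 1.0 computed in PyTorch.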
+ +from typing import Any, Dict, List + +import jinja2 +import torch + +from aitemplate import backend +from aitemplate.backend import registry +from aitemplate.backend.backend_spec import CUDASpec, ROCMSpec +from aitemplate.compiler import compile_model +from aitemplate.compiler.base import IntVar, Operator, Tensor +from aitemplate.testing import detect_target + + +class add_one(Operator): + def __init__(self): + super().__init__() + # required, unique identity of operator category + self._attrs["op"] = "add_one" + # we can put whatever we want into the op attrs for later use + self._attrs["has_profiler"] = False + self._attrs["nop"] = False + + def __call__(self, x: Tensor) -> Tensor: + # each operator needs to keep a record of input tensors + self._attrs["inputs"] = [x] + # optional, to set depth of the op based on inputs' depth, used in DFS + self._set_depth() + # infer output shape + output_shape = self._infer_shape(x) + # create output Tensor, of which the source op is the current op + output = Tensor(output_shape, src_ops={self}) + # remember current op's outputs + self._attrs["outputs"] = [output] + return output + + def _infer_shape(self, x) -> List[IntVar]: + return x.shape() + + def gen_function(self) -> str: + # this function will be used in codegen + # here we only need to redirect to backend codegen function + target = backend.target.Target.current() + func_key = f"{target.name()}.{self._attrs['op']}.gen_function" + func = registry.get(func_key) + return func(self._attrs) + + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +{{kernel}} + +} // namespace + +{{func_signature}} +{ + invoke_add_one(output, input, num_elements, stream); +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(half* output, + const half* input, + const int64_t num_elements, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}int64_t num_elements = 1; +{% for dim_name in dim_names %} +{{indent}}num_elements *= {{dim_name}}; +{% endfor %} + +{{indent}}{{func_name}}( +{{indent}} {{output}}, {{input}}, num_elements, stream /* default stream */ +{{indent}}); + """ +) + + +KERNEL_TEMPLATE = jinja2.Template( + """ +__global__ void add_one(half* output, const half* input, const int64_t num_elements) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + output[idx] = input[idx] + half(1.0); + } +} + +void invoke_add_one(half* output, const half* input, int64_t num_elements, {{prefix}}Stream_t stream) { + if (num_elements < 1024) { + dim3 grid(1); + dim3 block(num_elements); + add_one<<<grid, block, 0, stream>>>(output, input, num_elements); + } else { + dim3 grid((num_elements + 1024 - 1) / 1024); + dim3 block(1024); + add_one<<<grid, block, 0, stream>>>(output, input, num_elements); + } +} + """ +) + + +FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template( + """reinterpret_cast<half*>( + {% if is_cuda %}&({% endif %}{{name}}{% if is_cuda %}->raw()){% endif %})""" +) + + +def gen_function_call(func_attrs: Dict[str, Any], indent=" ", is_cuda=False) -> str: + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 1 + + output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"], is_cuda=is_cuda + ) + input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][0]._attrs["name"], is_cuda=is_cuda + ) + + dim_names = [dim._attrs["name"] for dim in func_attrs["inputs"][0].shape()] + return
FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + input=input_name, + dim_names=dim_names, + indent=indent, + ) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + prefix = backend_spec.prefix + return FUNC_TEMPLATE.render( + header_files=header_files, + kernel=KERNEL_TEMPLATE.render(prefix=prefix), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], prefix=prefix + ), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + prefix=backend_spec.prefix, + ).strip() + ) + + +CUDA_HEADER_FILES = """ +#include <cuda_fp16.h> +""" + + +@registry.reg("cuda.add_one.gen_function") +def cuda_add_one_gen_function(func_attrs: Dict[str, Any]) -> str: + return gen_function(func_attrs, CUDA_HEADER_FILES, CUDASpec()) + + +@registry.reg("cuda.add_one.func_decl") +def cuda_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str: + return gen_function_decl(func_attrs, CUDASpec()) + + +@registry.reg("cuda.add_one.func_call") +def cuda_add_one_gen_function_call(func_attrs: Dict[str, Any], indent=" ") -> str: + return gen_function_call(func_attrs, indent, is_cuda=True) + + +HIP_HEADER_FILES = """ +#include <hip/hip_fp16.h> +#include <hip/hip_runtime.h> +""" + + +@registry.reg("rocm.add_one.gen_function") +def rocm_add_one_gen_function(func_attrs: Dict[str, Any]) -> str: + return gen_function(func_attrs, HIP_HEADER_FILES, ROCMSpec()) + + +@registry.reg("rocm.add_one.func_decl") +def rocm_add_one_gen_function_decl(func_attrs: Dict[str, Any]) -> str: + return gen_function_decl(func_attrs, ROCMSpec()) + + +@registry.reg("rocm.add_one.func_call") +def rocm_add_one_gen_function_call(func_attrs: Dict[str, Any], indent=" ") -> str: + return gen_function_call(func_attrs, indent, is_cuda=False) + + +def create_ait_model(shapes): + X = Tensor( + shape=shapes, + dtype="float16", + name="X", + is_input=True, + ) + Y = add_one()(X) + Y._attrs["is_output"] = True + Y._attrs["name"] = "Y" + return Y + + +def verify_add_one(): + shapes = [16, 512] + x = torch.randn(shapes).cuda().half() + y_pt = x + 1.0 + + Y = create_ait_model([16, 512]) + target = detect_target() + with compile_model(Y, target, "./tmp", "add_one") as module: + y = torch.empty(shapes).cuda().half() + inputs = {"X": x} + outputs = {"Y": y} + module.run_with_tensors(inputs, outputs) + print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) + + +verify_add_one() diff --git a/examples/07_how_to_run_pt_model/how_to_run_pt_model.py b/examples/07_how_to_run_pt_model/how_to_run_pt_model.py new file mode 100644 index 000000000..993b7c69f --- /dev/null +++ b/examples/07_how_to_run_pt_model/how_to_run_pt_model.py @@ -0,0 +1,131 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
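+# +# Example overview: this script runs a small PyTorch model through AITemplate. +# It defines the same MLP twice (PTSimpleModel in PyTorch, AITSimpleModel with +# AIT's fast_gelu Linear specialization), maps the PyTorch weights to AIT +# constants via map_pt_params, compiles the graph with compile_model, verifies +# the outputs with torch.allclose, and benchmarks both implementations.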
+# +from collections import OrderedDict + +import torch + +from aitemplate.compiler import compile_model +from aitemplate.frontend import nn, Tensor +from aitemplate.testing import detect_target +from aitemplate.testing.benchmark_pt import benchmark_torch_function +from aitemplate.utils.graph_utils import sorted_graph_pseudo_code + + +class PTSimpleModel(torch.nn.Module): + def __init__(self, hidden, eps: float = 1e-5): + super().__init__() + self.dense1 = torch.nn.Linear(hidden, 4 * hidden) + self.act1 = torch.nn.functional.gelu + self.dense2 = torch.nn.Linear(4 * hidden, hidden) + self.layernorm = torch.nn.LayerNorm(hidden, eps=eps) + + def forward(self, input): + hidden_states = self.dense1(input) + hidden_states = self.act1(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = hidden_states + input + hidden_states = self.layernorm(hidden_states) + return hidden_states + + +class AITSimpleModel(nn.Module): + def __init__(self, hidden, eps: float = 1e-5): + super().__init__() + self.dense1 = nn.Linear(hidden, 4 * hidden, specialization="fast_gelu") + self.dense2 = nn.Linear(4 * hidden, hidden) + self.layernorm = nn.LayerNorm(hidden, eps=eps) + + def forward(self, input): + hidden_states = self.dense1(input) + hidden_states = self.dense2(hidden_states) + hidden_states = hidden_states + input + hidden_states = self.layernorm(hidden_states) + return hidden_states + + +def map_pt_params(ait_model, pt_model): + ait_model.name_parameter_tensor() + pt_params = dict(pt_model.named_parameters()) + mapped_pt_params = OrderedDict() + for name, _ in ait_model.named_parameters(): + ait_name = name.replace(".", "_") + assert name in pt_params + mapped_pt_params[ait_name] = pt_params[name] + return mapped_pt_params + + +def verify_simple_model(batch_size=1024, hidden=512): + # create pt model + pt_model = PTSimpleModel(hidden).cuda().half() + + # create pt input + x = torch.randn([batch_size, hidden]).cuda().half() + + # run pt model + pt_model.eval() + y_pt = pt_model(x) + + # create ait model + ait_model = AITSimpleModel(hidden) + X = Tensor( + shape=[batch_size, hidden], + name="X", + dtype="float16", + is_input=True, + ) + Y = ait_model(X) + Y._attrs["is_output"] = True + Y._attrs["name"] = "Y" + + # map pt weights to ait + weights = map_pt_params(ait_model, pt_model) + + # code gen + target = detect_target() + with compile_model( + Y, target, "./tmp", "simple_model_demo", constants=weights + ) as module: + # create storage for output tensor + y = torch.empty([batch_size, hidden]).cuda().half() + + # inputs and outputs dict + inputs = {"X": x} + outputs = {"Y": y} + + # run + module.run_with_tensors(inputs, outputs, graph_mode=True) + + # verify output is correct + print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) + + # benchmark ait and pt + count = 1000 + ait_t, _, _ = module.benchmark_with_tensors( + inputs, outputs, graph_mode=True, count=count + ) + print(f"AITemplate time: {ait_t} ms/iter") + + pt_t = benchmark_torch_function(count, pt_model.forward, x) + print(f"PyTorch eager time: {pt_t} ms/iter") + + # check out the fused graph + # there are only fused ops in the final graph + # gemm_rcr_bias_fast_gelu, gemm_rcr_bias_add, and layernorm + graph = module.debug_sorted_graph + print("Final graph:") + print(sorted_graph_pseudo_code(graph)) + + +verify_simple_model() diff --git a/licenses/LICENSE.composable_kernel.txt b/licenses/LICENSE.composable_kernel.txt new file mode 100644 index 000000000..2fe9a8455 --- /dev/null +++ b/licenses/LICENSE.composable_kernel.txt @@ 
-0,0 +1,28 @@ +Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang) +Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang) +Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan) +Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang) +Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah) +Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou) +Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan) + +SPDX-License-Identifier: MIT +Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/licenses/LICENSE.cub.txt b/licenses/LICENSE.cub.txt new file mode 100644 index 000000000..6aeea8da6 --- /dev/null +++ b/licenses/LICENSE.cub.txt @@ -0,0 +1,24 @@ +Copyright (c) 2010-2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/licenses/LICENSE.cutlass.txt b/licenses/LICENSE.cutlass.txt new file mode 100644 index 000000000..d9219ec9b --- /dev/null +++ b/licenses/LICENSE.cutlass.txt @@ -0,0 +1,27 @@ +Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/licenses/LICENSE.dmlc.txt b/licenses/LICENSE.dmlc.txt new file mode 100644 index 000000000..8dada3eda --- /dev/null +++ b/licenses/LICENSE.dmlc.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/licenses/LICENSE.flash_attention.txt b/licenses/LICENSE.flash_attention.txt new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/licenses/LICENSE.flash_attention.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/licenses/LICENSE.hipcub.txt b/licenses/LICENSE.hipcub.txt new file mode 100644 index 000000000..c284d2bd9 --- /dev/null +++ b/licenses/LICENSE.hipcub.txt @@ -0,0 +1,25 @@ +Copyright (c) 2010-2011, Duane Merrill. All rights reserved. +Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. +Modifications Copyright (c) 2019-2021, Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/licenses/LICENSE.markdown_table.txt b/licenses/LICENSE.markdown_table.txt new file mode 100644 index 000000000..6a5cab0c2 --- /dev/null +++ b/licenses/LICENSE.markdown_table.txt @@ -0,0 +1,21 @@ +# MIT License + +# Copyright (c) 2020 hvalev + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. diff --git a/licenses/LICENSE.oneflow.txt b/licenses/LICENSE.oneflow.txt new file mode 100644 index 000000000..f31ebbb41 --- /dev/null +++ b/licenses/LICENSE.oneflow.txt @@ -0,0 +1,202 @@ +Copyright 2020 The OneFlow Authors. All rights reserved. 
+ Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/licenses/LICENSE.pydot.txt b/licenses/LICENSE.pydot.txt new file mode 100644 index 000000000..741171aa6 --- /dev/null +++ b/licenses/LICENSE.pydot.txt @@ -0,0 +1,21 @@ +Copyright (c) 2014 Carlos Jenkins +Copyright (c) 2014 Lance Hepler +Copyright (c) 2004 Ero Carrera + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/licenses/LICENSE.pytorch.txt b/licenses/LICENSE.pytorch.txt new file mode 100644 index 000000000..04f9ad110 --- /dev/null +++ b/licenses/LICENSE.pytorch.txt @@ -0,0 +1,77 @@ +From PyTorch: + +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + +From Caffe2: + +Copyright (c) 2016-present, Facebook Inc. All rights reserved. + +All contributions by Facebook: +Copyright (c) 2016 Facebook Inc. + +All contributions by Google: +Copyright (c) 2015 Google Inc. +All rights reserved. + +All contributions by Yangqing Jia: +Copyright (c) 2015 Yangqing Jia +All rights reserved. + +All contributions by Kakao Brain: +Copyright 2019-2020 Kakao Brain + +All contributions by Cruise LLC: +Copyright (c) 2022 Cruise LLC. +All rights reserved. + +All contributions from Caffe: +Copyright(c) 2013, 2014, 2015, the respective contributors +All rights reserved. + +All other contributions: +Copyright(c) 2015, 2016 the respective contributors +All rights reserved. + +Caffe2 uses a copyright model similar to Caffe: each contributor holds +copyright over their contributions to Caffe2. The project versioning records +all such contribution and copyright details. 
If a contributor wants to further +mark their specific copyright on a particular contribution, they should +indicate their copyright solely in the commit message of the change when it is +committed. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + and IDIAP Research Institute nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/licenses/LICENSE.tensorrt.txt b/licenses/LICENSE.tensorrt.txt new file mode 100644 index 000000000..e29455903 --- /dev/null +++ b/licenses/LICENSE.tensorrt.txt @@ -0,0 +1,337 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2021 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + PORTIONS LICENSED AS FOLLOWS + + > tools/pytorch-quantization/examples/torchvision/models/classification/resnet.py + + BSD 3-Clause License + + Copyright (c) Soumith Chintala 2016, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + > samples/common/windows/getopt.c + + Copyright (c) 2002 Todd C. Miller + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + Sponsored in part by the Defense Advanced Research Projects + Agency (DARPA) and Air Force Research Laboratory, Air Force + Materiel Command, USAF, under agreement number F39502-99-1-0512. + + + Copyright (c) 2000 The NetBSD Foundation, Inc. + All rights reserved. + + This code is derived from software contributed to The NetBSD Foundation + by Dieter Baron and Thomas Klausner. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + - Copyright (c) 2002 Todd C. Miller + - Copyright (c) 2000 The NetBSD Foundation, Inc. 
+ + + > parsers/common/ieee_half.h + > samples/common/half.h + > third_party/ieee/half.h + + The MIT License + + Copyright (c) 2012-2017 Christian Rau + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + + > plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.cu + > plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableAttn.h + > plugin/multiscaleDeformableAttnPlugin/multiscaleDeformableIm2ColCuda.cuh + + Copyright 2020 SenseTime + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + DETR + + Copyright 2020 - present, Facebook, Inc + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/licenses/license.header.txt b/licenses/license.header.txt new file mode 100644 index 000000000..78af24e7a --- /dev/null +++ b/licenses/license.header.txt @@ -0,0 +1,13 @@ + Copyright (c) Meta Platforms, Inc. and affiliates. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/python/aitemplate/__init__.py b/python/aitemplate/__init__.py new file mode 100644 index 000000000..ed1d8a72e --- /dev/null +++ b/python/aitemplate/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import logging +import os +import sys + +from . import backend, compiler, frontend, testing, utils +from ._libinfo import __version__ # noqa + +if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 7): + PY3STATEMENT = "The minimal Python requirement is Python 3.7" + raise Exception(PY3STATEMENT) + +__all__ = ["backend", "compiler", "frontend", "testing", "utils"] + +root_logger = logging.getLogger(__name__) +info_handle = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s %(levelname)s <%(name)s> %(message)s") +info_handle.setFormatter(formatter) +root_logger.addHandler(info_handle) +root_logger.propagate = False + +DEFAULT_LOGLEVEL = logging.getLogger().level +log_level_str = os.environ.get("LOGLEVEL", None) +LOG_LEVEL = ( + getattr(logging, log_level_str.upper()) + if log_level_str is not None + else DEFAULT_LOGLEVEL +) +root_logger.setLevel(LOG_LEVEL) diff --git a/python/aitemplate/_libinfo.py b/python/aitemplate/_libinfo.py new file mode 100644 index 000000000..6aacc3444 --- /dev/null +++ b/python/aitemplate/_libinfo.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# current version +# We use the version of the incoming release for code +__version__ = "0.1.dev0" diff --git a/python/aitemplate/backend/__init__.py b/python/aitemplate/backend/__init__.py new file mode 100644 index 000000000..8e7aaca0d --- /dev/null +++ b/python/aitemplate/backend/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Backend for AITemplate. +""" +from . 
import ( # noqa + backend_spec, + builder, + codegen, + cuda, + profiler_runner, + registry, + rocm, + target, +) + +__all__ = [ + "builder", + "codegen", + "cuda", + "profiler_runner", + "registry", + "rocm", + "target", +] diff --git a/python/aitemplate/backend/backend_spec.py b/python/aitemplate/backend/backend_spec.py new file mode 100644 index 000000000..44daa1f3c --- /dev/null +++ b/python/aitemplate/backend/backend_spec.py @@ -0,0 +1,280 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Backend Specifications. +""" + +from dataclasses import dataclass, field + +from typing import Dict, List, Tuple + +import jinja2 + +from ..compiler.ops.common.epilogue import FuncEnum +from .target import Target + + +@dataclass +class BackendSpec: + dtype_to_backend_fp16_dtype: Dict[str, str] = field( + default_factory=lambda: { + "float16": "half", + } + ) + + dtype_to_backend_dtype: Dict[str, str] = field( + default_factory=lambda: { + "float16": "half", + "float": "float", + "int64": "int64_t", + } + ) + + backend_datatype_convertors: Dict[str, Dict[str, str]] = field( + default_factory=lambda: { + "half": {"float": "__half2float"}, + "float": {"half": "__float2half_rn"}, + } + ) + + read_num_elements_to_backend_type: List[Tuple[int, str]] = field( + default_factory=lambda: [ + (8, "uint4"), + (4, "uint2"), + (2, "uint"), + (1, "half"), + ] + ) + op_num_elements_to_backend_type: List[Tuple[int, str]] = field( + default_factory=lambda: [ + (2, "half2"), + (1, "half"), + ] + ) + op_type_priority_list: List[str] = field( + default_factory=lambda: [ + "half2", + "half", + "float", + ] + ) + + func_enum_to_func_name: Dict[FuncEnum, Dict[str, str]] = field( + default_factory=lambda: { + FuncEnum.ADD: { + "half2": "__hadd2", + "half": "__hadd", + "float": "__fadd_rn", + }, + FuncEnum.SUB: { + "half2": "__hsub2", + "half": "__hsub", + "float": "__fsub_rn", + }, + FuncEnum.MUL: { + "half2": "__hmul2", + "half": "__hmul", + "float": "__fmul_rn", + }, + FuncEnum.DIV: { + "half2": "__h2div", + "half": "__hdiv", + "float": "__fdiv_rn", + }, + FuncEnum.COS: { + "half2": "h2cos", + "half": "hcos", + "float": "cosf", + }, + FuncEnum.SIN: { + "half2": "h2sin", + "half": "hsin" if Target.current().name() == "cuda" else "hsin_custom", + "float": "sinf", + }, + FuncEnum.TANH: { + "half2": "fast_tanh", + "half": "fast_tanh", + "float": "tanh", + }, + FuncEnum.ABS: { + "half2": "__habs2", + "half": "__habs", + "float": "fabsf", + }, + FuncEnum.LOGE: { + "half2": "h2log", + "half": "hlog", + "float": "logf", + }, + FuncEnum.EXP: { + "half2": "h2exp", + "half": "hexp", + "float": "expf", + }, + FuncEnum.SQRT: { + "half2": "h2sqrt", + "half": "hsqrt", + "float": "sqrtf", + }, + FuncEnum.MAX: { + "half2": "hmax2_nan", + "half": "hmax_nan", + "float": "fmaxf_nan", + }, + FuncEnum.MIN: { + "half2": "hmin2_nan", + "half": "hmin_nan", + "float": "fminf_nan", + }, + FuncEnum.SIGN: { + "half2": "h2sign_custom", + "half": "sign_custom", + "float": "sign_custom", + }, + 
FuncEnum.SIGMOID: { + "half2": "h2sigmoid_custom", + "half": "hsigmoid_custom", + "float": "fsigmoid_custom", + }, + FuncEnum.LRELU: { + "half2": "leaky_relu", + "half": "leaky_relu", + "float": "leaky_relu", + }, + FuncEnum.HARDTANH: { + "half2": "h2hard_tanh", + "half": "hard_tanh", + "float": "hard_tanh", + }, + FuncEnum.RELU: {"half2": "relu", "half": "relu", "float": "relu"}, + FuncEnum.NAN_TO_NUM: { + "half2": "nan_to_num", + "half": "nan_to_num", + "float": "nan_to_num", + }, + FuncEnum.CLAMP_NAN_TO_NUM: { + "half2": "clamp_nan_to_num", + "half": "clamp_nan_to_num", + "float": "clamp_nan_to_num", + }, + FuncEnum.SILU: { + "half2": "h2silu", + "half": "hsilu", + "float": "fsilu", + }, + } + ) + + def get_backend_type( + self, + num_elements: int, + dtype: str, + num_elements_to_backend_type_list: List[Tuple[int, str]], + ) -> str: + if dtype != "float16": + raise NotImplementedError("Unsupported dtype {}!".format(dtype)) + for num, backend_type in num_elements_to_backend_type_list: + if num_elements % num == 0: + return backend_type + raise RuntimeError( + "Failed to infer data type! num_elements: {}, num_elements_to_backend_type_list: {}".format( + num_elements, num_elements_to_backend_type_list + ) + ) + + def get_candidate_op_types(self, op_t: str) -> List[str]: + res = [] + found = False + for t in self.op_type_priority_list: + if t == op_t: + found = True + if found: + res.append(t) + return res + + def get_dtype_to_dtype(self, dtype: str, type_dict: Dict[str, str]): + data_type = type_dict.get(dtype) + if not data_type: + raise NotImplementedError("Unsupported dtype {}!".format(dtype)) + return data_type + + def get_fp16_dtype(self, dtype: str): + return self.get_dtype_to_dtype(dtype, self.dtype_to_backend_fp16_dtype) + + def dtype_to_backend_type(self, dtype: str): + return self.get_dtype_to_dtype(dtype, self.dtype_to_backend_dtype) + + +@dataclass +class ROCMSpec(BackendSpec): + backend_name = "rocm" + index_type = "int64_t" + prefix = "hip" + stream = "stream" + cub = "hipcub" + + cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})") + cast_to_const_half_ptr_template = jinja2.Template( + "reinterpret_cast<const half*>({{name}})" + ) + header_src_template = jinja2.Template( + """ +#include <hip/hip_fp16.h> +#include <hip/hip_runtime.h> +{{extra_header}} + """ + ) + half2_data_ref = ".data" + + dtype_to_ck_type: Dict[str, str] = field( + default_factory=lambda: { + "float16": "ck::half_t", + "float": "float", + } + ) + + def dtype_to_lib_type(self, dtype: str): + return self.get_dtype_to_dtype(dtype, self.dtype_to_ck_type) + + +@dataclass +class CUDASpec(BackendSpec): + backend_name = "cuda" + index_type = "int64_t" + prefix = "cuda" + stream = "stream" + cub = "cub" + + cast_to_half_ptr_template = jinja2.Template("reinterpret_cast<half*>({{name}})") + cast_to_const_half_ptr_template = jinja2.Template( + "reinterpret_cast<const half*>({{name}})" + ) + header_src_template = jinja2.Template( + """ +#include <cuda_fp16.h> +{{extra_header}} + """ + ) + + half2_data_ref = "" + dtype_to_cutlass_type: Dict[str, str] = field( + default_factory=lambda: { + "float16": "cutlass::half_t", + "float": "float", + } + ) + + def dtype_to_lib_type(self, dtype: str): + return self.get_dtype_to_dtype(dtype, self.dtype_to_cutlass_type) diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py new file mode 100644 index 000000000..80699a79b --- /dev/null +++ b/python/aitemplate/backend/builder.py @@ -0,0 +1,295 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Builder is a module to compile generated source code files into binary objects. +""" + +from __future__ import annotations + +import multiprocessing + +import os +import pathlib +import re +import typing +from typing import Optional + +import jinja2 + +from ..utils import logger +from .target import Target +from .task_runner import BaseRunner, Task + +# pylint: disable=W0221,C0103 + + +def process_task(task: Task) -> None: + """This function extracts stdout and stderr from a finished task. + If the task process return code is not 0, will mark the task as + a failed task. + + Parameters + ---------- + task : Task + A compiling task + """ + stdout = task._stdout + stderr = task._stderr + if task._proc.returncode != 0: + task._failed = True + logger.info( + __name__, + "Failed: [{name}]\ncmd:\n{cmd}\nstderr:\n{stderr}\nstdout:{stdout}".format( + name=task._name, cmd=task._cmd, stderr=stderr, stdout=stdout + ), + ) + task._ret = -1 + else: + logger.debug( + __name__, + "Successful: [{name}]\ncmd:\n{cmd}\nstderr:\n{stderr}\nstdout:{stdout}".format( + name=task._name, cmd=task._cmd, stderr=stderr, stdout=stdout + ), + ) + task._ret = 0 + + +def process_return(task: Task) -> None: + """This function process the task. If task is timeout or failed, + raise a runtime error. + + Parameters + ---------- + task : Task + A compiling task. + + Raises + ------ + RuntimeError + Compiling failed. + """ + if not task.is_timeout() and task.is_failed(): + raise RuntimeError(f"Building failed. Logs:\n{task._stdout}\n{task._stderr}") + + +class Runner(BaseRunner): + """A parallel runner for compiling tasks. + Runner is inherited from BaseRunner. + """ + + def __init__(self, devs: list[int], timeout: int = 10): + """Initialize a parallel runner for building + + Parameters + ---------- + devs : list[int] + CPU ids for compiling + timeout : int, optional + Compiling timeout, by default 10 (seconds) + """ + super().__init__(devs, "builder", timeout) + logger.info( + __name__, + "Using {n} CPU for building".format(n=devs), + ) + self._ftask_proc = process_task + self._fret_proc = process_return + + def push(self, idx: typing.Union[int, str], cmd: str, target: Target) -> None: + """Push a building task into runner + + Parameters + ---------- + idx : Union[int, str] + Task id + cmd : str + bash command for compiling + target : Target + Target device type for building + """ + self._queue.append(Task(idx, cmd, target, shell=True)) + + def pull(self) -> list[None]: + """Pull building results. + Check whether all building tasks are successful. + + Returns + ------- + list + An empty list + """ + ret = super().pull(self._ftask_proc, self._fret_proc) + return ret + + +class Builder(object): + """Builder is a module to compile generated source code + files into binary objects. + """ + + def __init__(self, n_jobs: int = -1, timeout: int = 180) -> None: + """Initialize a parallel builder for compiling source code. 
+ + Parameters + ---------- + n_jobs : int, optional + Run how many parallel compiling job, + by default -1, which will set n_jobs to `multiprocessing.cpu_count()` + timeout : int, optional + Timeout value, by default 180 (seconds) + """ + if n_jobs < 0: + n_jobs = multiprocessing.cpu_count() + num_builder = os.environ.get("NUM_BUILDERS", None) + if num_builder is not None: + n_jobs = int(num_builder) + self._runner = Runner(n_jobs, timeout) + + def build_objs( + self, + files: list[typing.Tuple[str, str]], + cc_cmd: str, + binary_cc_cmd: Optional[str] = None, + ): + """Generate building task for each source code file, then build in parallel + + Parameters + ---------- + files : list[Tuple[str, str]] + list of tuples of source code path and object file path + cc_cmd : str + command line template for building objects + binary_cc_cmd : optional, str + command line template for turning raw binary files (those ending in .bin) into + objects. Since most compilation jobs will not need to compile these, this argument + is optional. + """ + for idx, fpair in enumerate(files): + src, target = fpair + logger.info(__name__, "Building " + target) + if src.endswith(".bin"): + if binary_cc_cmd is None: + raise ValueError( + "Cannot compile .bin file without specifying binary_cc_cmd!" + ) + + src_path = pathlib.Path(src) + target_path = pathlib.Path(target) + compile_cmd = binary_cc_cmd.format( + target=target_path.name, src=src_path.name + ) + containing_dir = str(src_path.parent.absolute()) + # Have to cd into the containing dir so ld doesn't include + # the path in the symbol names; unfortunately, there's no other + # way to control this. + if logger.is_debug(): + cmd = f"cd {containing_dir} && {compile_cmd} && cd -" + else: + # If not in debug mode, remove the original .bin file which can potentially be quite large. + cmd = f"cd {containing_dir} && {compile_cmd} && rm {src_path.name} && cd -" + else: + cmd = cc_cmd.format(target=target, src=src) + + logger.debug(__name__, f"The cmd for building {target} is : {cmd}") + self._runner.push(idx, cmd, target) + self._runner.join() + self._runner.pull() + + def build_so(self, target: Target, objs: list[str]): + """Generate a task to build all objects into a dynamic library + + Parameters + ---------- + target : Target + Device target of dynamic library + objs : list[str] + List of all object file paths for building the dynamic library. 
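+
+        A minimal usage sketch (hypothetical file names, for illustration
+        only; the real compile command comes from the active Target):
+
+            builder = Builder(n_jobs=4)
+            builder.build_objs(
+                [("./tmp/model-generated.cu", "./tmp/model-generated.obj")],
+                Target.current().compile_cmd(False),
+            )
+            builder.build_so("./tmp/test.so", ["./tmp/model-generated.obj"])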
+ """ + logger.info(__name__, "Building " + target) + cc = Target.current().cc() + compile_options = Target.current().compile_options() + fpic = "-fPIC" + if "nvcc" in cc: + fpic = "-Xcompiler=-fPIC" + cmd = ( + "{cc} -shared ".format(cc=cc) + + fpic + + " " + + compile_options + + " -o {target} {objs}".format(target=target, objs=" ".join(objs)) + ) + logger.debug(__name__, f"The cmd for building {target} is {cmd}") + self._runner.push(0, cmd, target) + self._runner.join() + self._runner.pull() + + def gen_makefile(self, file_pairs, dll_name, workdir, test_name): + + makefile_template = jinja2.Template( + """ +CC = {{cc}} +CFLAGS = {{CFLAGS}} +fPIC_flag = {{fPIC}} + +obj_files = {{obj_files}} + +%.obj : %.{{cpp}} + {{cfile_cmd}} +%.obj : %.bin + {{bfile_cmd}} + +.PHONY: all +all: {{target}} + +{{target}}: $(obj_files) + $(CC) -shared $(fPIC_flag) $(CFLAGS) -o $@ $(obj_files) + +clean: + rm -f *.obj test.so +""" + ) + + obj_files = [pair[1].split("/")[-1] for pair in file_pairs] + obj_files = " ".join(obj_files) + + cc = Target.current().cc() + compile_options = Target.current().compile_options() + + fpic, cpp = "-fPIC", "cpp" + if "nvcc" in cc: + fpic, cpp = "-Xcompiler=-fPIC", "cu" + + cfile_cmd = Target.current().compile_cmd(False).format(target="$@", src="$<") + bfile_cmd = Target.current().binary_compile_cmd() + if not bfile_cmd: + bfile_cmd = "" + else: + bfile_cmd = bfile_cmd.format(target="$@", src="$<") + + makefile_str = makefile_template.render( + cc=cc, + cpp=cpp, + CFLAGS=compile_options, + fPIC=fpic, + obj_files=obj_files, + target=dll_name, + cfile_cmd=cfile_cmd, + bfile_cmd=bfile_cmd, + ) + + dumpfile = os.path.join(workdir, test_name, "Makefile") + with open(dumpfile, "w+") as f: + # fix the makefile indentation + f.write(re.sub("^ ", "\t", makefile_str, flags=re.M)) diff --git a/python/aitemplate/backend/codegen.py b/python/aitemplate/backend/codegen.py new file mode 100644 index 000000000..fcd806882 --- /dev/null +++ b/python/aitemplate/backend/codegen.py @@ -0,0 +1,744 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +This module is for generating the final C++ +source code in files from Tensor and Operators. +Functions in this module will be used for generating +function source code files, profiler source code files, +and model driver source code files. +""" + +from __future__ import annotations + +import io +import os +from typing import Any, Dict, List, Optional, Tuple + +from aitemplate.backend.main_templates import MODEL_CONTAINER_TEMPLATE, MODEL_TEMPLATE +from aitemplate.compiler.base import Operator +from aitemplate.compiler.tensor_accessor import TensorAccessor + +from aitemplate.compiler.transform.memory_planning import Workspace + +from ..compiler.base import get_dtype_size, IntImm, IntVar, Tensor +from . 
import registry +from .target import Target + +# pylint: disable=C0103,W0613,C0301 + +DTYPE_TO_POINTERTYPE: Dict[str, str] = { + "float32": "float*", + "float": "float*", + "int": "int32_t*", + "int32": "int32_t*", + "int64": "int64_t*", +} + + +def gen_profiler(sorted_graph: list[Tensor], workdir: str, dynamic_profiling_strategy): + """Generate operator profiler source code files for the given graph + + Parameters + ---------- + sorted_graph : list[Tensor] + The network after running toposort transformation + workdir : str + Target directory for generated C++ source code files + dynamic_profiling_strategy: DynamicProfileStrategy, optional + A dynamic profiling strategy, used to filter generated profiles at compile time. + Pass-through to gen_profiler kernels of nodes in the graph. + See also: :func:`~aitemplate.compiler.transform.profile.profile` + """ + for node in sorted_graph: + for func in node.src_ops(): + if "has_profiler" in func._attrs and func._attrs["has_profiler"]: + func.gen_profiler(workdir, dynamic_profiling_strategy) + + +def gen_function_src( + sorted_graph: list[Tensor], workdir: str, model_name: str = "" +) -> list[Tuple[str, str]]: + """Generate functions source code files for the given graph + + Parameters + ---------- + sorted_graph : list[Tensor] + The network after running toposort transformation + workdir : str + Target directory for generated C++ source code files + model_name : str, optional + Sub working directory in the workdir for the given model, by default "" + + Returns + ------- + list[Tuple[str, str]] + List of tuple (source file path, object file path) + """ + target = Target.current() + file_pairs = [] + exist_func = set() + prefix = os.path.join(workdir, model_name) + for node in sorted_graph: + for func in node.src_ops(): + fname = func._attrs["name"] + if fname not in exist_func: + src_path = os.path.join(prefix, fname + target.src_extension()) + obj_path = os.path.join(prefix, fname + ".obj") + file_pairs.append((src_path, obj_path)) + with open(src_path, "w") as fo: + fo.write(func.gen_function()) + exist_func.add(fname) + return file_pairs + + +def map_set( + map_name: str, + key_name: str, + value_name: Optional[str] = None, + indent: str = " ", +) -> str: + """Generate a string setting a value in a map. + + If value name is given, sets map_name["key_name"] = value_name. Else, sets + map_name["key_name"] = key_name. Special maps like dim_map may make + additional modificiations to the LHS of this expression. + + Parameters + ---------- + map_name : str + The map to use + key_name : str + The key to set. Will be put into quotes. + value_name : Optional[str] + If set, force map_name["key_name"] = value_name + indent : str + For formatting + + Returns + ------- + str + The formatted map set statement. + """ + if value_name is not None: + value = value_name + else: + value = key_name + if map_name == "dim_map": + # Because ROCM backend uses int64_t while CUDA uses int, + # this is a temporary workaround to cast int64_t* to int*. + # FIXME: After we unified the two backends, + # reinterpret_cast should be removed. + value = f"reinterpret_cast(&{value})" + + return f'{indent}{map_name}["{key_name}"] = {value};' + + +def set_value(lhs: Any, rhs: Any, indent: str = " ") -> str: + return f"{indent}{lhs} = {rhs};" + + +def set_value_from_map(map_name: Any, var_name: Any, indent: str = " ") -> str: + """Generate a string that sets a value to something stored in a map. 
+ + Parameters + ---------- + map_name : str + The map to use + var_name : str + The var_name, used as the name of the value and the key. + indent : str + For formatting + + Returns + ------- + str + The formatted statement. + """ + key = var_name + value = var_name + return f'{indent}{value} = static_cast({map_name}["{key}"]);' + + +def dtype_to_enumerator(dtype): + def _impl(dtype): + if dtype == "float16": + return "kHalf" + elif dtype == "float32" or dtype == "float": + return "kFloat" + elif dtype == "int32" or dtype == "int": + return "kInt" + elif dtype == "int64": + return "kLong" + else: + raise AssertionError(f"unknown dtype {dtype}") + + return f"AITemplateDtype::{_impl(dtype)}" + + +def count_inputs_outputs(graph): + n_inputs = n_outputs = 0 + for node in graph: + if node._attrs["is_input"]: + n_inputs += 1 + if node._attrs["is_output"]: + n_outputs += 1 + return n_inputs, n_outputs + + +def check_not_null( + tensor: Tensor, + tensor_idx: Optional[int] = None, + skip_if_lower_bound_is_zero: bool = False, +) -> str: + """ + Generate a nullptr check to be used by pointer initialization code. + + If skip_if_lower_bound_is_zero == True, no code will be generated + when the Tensor has at least one dynamic dim with a lower bound + of zero. This is most useful for outputs; we put the nullptr + checks at the start of the inference, but we won't know output + shapes until after Run() finishes. We therefore just relax the check + for these outputs - only allow them to be null if their lower bound + is zero, otherwise never allow them to be null. + """ + name = tensor._attrs["name"] + if tensor_idx is None: + check = name + else: + check = f"params[{tensor_idx}].ptr" + + shape = ["1"] + lower_bound_is_zero = False + for dim in tensor._attrs["shape"]: + lower_bound_is_zero |= dim.lower_bound() == 0 + if skip_if_lower_bound_is_zero and lower_bound_is_zero: + return "" + if isinstance(dim, IntImm): + shape.append(str(dim._attrs["values"][0])) + else: + shape.append(dim._attrs["name"]) + + nullptr_check = f"{check} == nullptr" + condition = ( + nullptr_check + # If the lower bound of the shape is positive, never allow + # the tensor to be null. + if not lower_bound_is_zero + # Otherwise, allow it to be null only if the (possibly dynamic) + # size is zero. + else f"{nullptr_check} && {'*'.join(shape)} != 0" + ) + return f""" +if ({condition}) {{ + throw std::runtime_error("Constant {name} was not set! 
Set the value with set_constant."); +}} + """ + + +def device_copy(dst_tensor: Tensor, src_tensor: Tensor, dst_idx: int) -> str: + src_name = src_tensor._attrs["name"] + dst_ptr = f"params[{dst_idx}].ptr" + shape = ["1"] + for dim in dst_tensor._attrs["shape"]: + if isinstance(dim, IntImm): + shape.append(str(dim._attrs["values"][0])) + else: + shape.append(dim._attrs["name"]) + shape = "*".join(shape) + size = f"{shape} * {get_dtype_size(dst_tensor._attrs['dtype'])}" + return f"DEVICE_CHECK(DeviceToDeviceCopy({dst_ptr}, {src_name}, {size}, stream));" + + +class ModelContainerGenerator: + def __init__( + self, + max_blob_size: int, + max_constant_blob_size: int, + workspace: Workspace, + num_inputs: int, + num_outputs: int, + constants_data_file: io.BytesIO, + output_name_to_idx: Dict[str, int], + ): + self.target = Target.current() + self.f_var_decl = registry.get(self.target.name() + ".lib.var_decl") + self.f_ptr_decl = registry.get(self.target.name() + ".lib.ptr_decl") + + self.constants_data_file = constants_data_file + + self.exist_funcs = set() + self.func_decl = [] + self.tensor_slice = [] + self.tensor_map_set = [] + self.set_inputs = [] + self.func_seq = [] + self.tensor_decl = [] + self.dim_decl = [] + self.device_to_device_copies = [] + self.function_state = [] + self.set_up_constants = [] + self.set_up_param_names = [] + self.set_up_param_dtypes = [] + self.set_up_output_shapes = [] + self.set_up_param_dynamic_shapes = [] + self.state_record = set() + self.visited_func = set() + self.visited_dims = set() + self.set_up_constant_names = [] + self.param_name_to_ptr_idx = {} + + self.num_constants = 0 + self.constants_data_size = 0 + self.owned_constants_init = [] + + self.input_idx = 0 + self.unbound_constant_idx = 0 + self.output_name_to_idx = output_name_to_idx + + ( + self.max_blob_size, + self.max_constant_blob_size, + self.workspace, + self.num_inputs, + self.num_outputs, + ) = ( + max_blob_size, + max_constant_blob_size, + workspace, + num_inputs, + num_outputs, + ) + + def _tensor_slice_func( + self, + node: Tensor, + blob_name: str, + indent=" ", + ) -> str: + offset = node._attrs["offset"] + name = node._attrs["name"] + return f"{indent}{name} = reinterpret_cast({blob_name} + {offset});" + + def _record_param_tensor_info(self, tensor: Tensor, idx: int) -> None: + def max_value(var_or_imm): + if isinstance(var_or_imm, IntImm): + return var_or_imm.value() + else: + assert isinstance(var_or_imm, IntVar) + return var_or_imm.upper_bound() + + shape_init = ", ".join(str(max_value(dim)) for dim in tensor._attrs["shape"]) + param_shape_init = ", ".join( + f'&{dim._attrs["name"]}' for dim in tensor._attrs["shape"] + ) + self.set_up_output_shapes.append( + set_value(f"max_param_shapes_[{idx}]", f"{{{shape_init}}}") + ) + param_shape_init = ", ".join( + f'ParamDim({dim.lower_bound()}, {dim.upper_bound()}, &{dim._attrs["name"]})' + for dim in tensor._attrs["shape"] + ) + self.set_up_param_dynamic_shapes.append( + set_value(f"params[{idx}].shape_ptrs", f"{{{param_shape_init}}}") + ) + name = tensor._attrs["name"] + self.set_up_param_names.append(set_value(f"param_names_[{idx}]", f'"{name}"')) + self.set_up_param_dtypes.append( + set_value( + f"param_dtypes_[{idx}]", + dtype_to_enumerator(tensor.dtype()), + ) + ) + + def _codegen_param_setup( + self, + tensor: Tensor, + ) -> None: + """ + Generate code needed for setting up a constant in Model/ModelContainer. + """ + name = tensor._attrs["name"] + data = tensor._attrs["data"] + if data is not None: + # Owned constant. 
Set up logic for copying the constant in from *.so. + assert ( + tensor._attrs["offset"] >= 0 + ), f"Constant node '{name}' must have non-negative offset" + self.set_up_constants.append(self._tensor_slice_func(tensor, "constants")) + num_bytes = len(data) + self.constants_data_file.write(data.to_bytes()) + + constant_info = f'ConstantInfo{{"{name}", {self.constants_data_size}, {tensor._attrs["offset"]}, {num_bytes}}}' + self.owned_constants_init.append(constant_info) + self.constants_data_size += num_bytes + self.num_constants += 1 + else: + # Unbound constant. We will expect the user to set this via SetConstant. + self.set_up_constant_names.append( + set_value( + f'unbound_constant_name_to_idx_["{name}"]', + self.unbound_constant_idx, + ) + ) + self._record_param_tensor_info( + tensor, self.unbound_constant_idx + self.num_inputs + self.num_outputs + ) + self.unbound_constant_idx += 1 + self.set_inputs.append(check_not_null(tensor)) + self.set_up_constants.append( + set_value( + f'constant_name_to_ptr_["{name}"]', + f"const_cast(reinterpret_cast(&{name}))", + ) + ) + + def _codegen_input_tensor(self, tensor: Tensor) -> None: + name = tensor._attrs["name"] + view = tensor._attrs["is_view_of"] + assert ( + view is None + ), f"_codegen_input_tensor cannot be called with a view; expected a non-view tensor with is_input=True, got: {tensor}" + self.set_inputs.append( + set_value( + name, + f"static_cast(params[{self.input_idx}].ptr)", + ) + ) + self.set_inputs.append(check_not_null(tensor)) + self.param_name_to_ptr_idx[name] = self.input_idx + self._record_param_tensor_info(tensor, self.input_idx) + self.input_idx += 1 + + def _get_output_idx(self, name: str) -> int: + assert ( + name in self.output_name_to_idx + ), f"Tensor {name} was marked as an output, but its index was not found in output_name_to_index" + # Add num_inputs since we internally store outputs in the same array as inputs w/ + # inputs first + return self.output_name_to_idx[name] + self.num_inputs + + def _codegen_output_aliases_tensor(self, tensor: Tensor) -> None: + name = tensor._attrs["name"] + view = tensor._attrs["is_view_of"] + if tensor._attrs["external_tensor"] is not None: + self.set_inputs.append(set_value(name, view._attrs["name"])) + return + is_view = view is not None + if is_view: + ptr_idx = self.param_name_to_ptr_idx[view._attrs["name"]] + self.set_inputs.append(set_value(name, view._attrs["name"])) + else: + ptr_idx = self._get_output_idx(name) + self.set_inputs.append( + set_value( + name, + f"static_cast(params[{ptr_idx}].ptr)", + ) + ) + + self.param_name_to_ptr_idx[name] = ptr_idx + if tensor._attrs["is_output"]: + self._record_param_tensor_info(tensor, ptr_idx) + self.set_inputs.append( + check_not_null(tensor, skip_if_lower_bound_is_zero=True) + ) + + def _codegen_output_tensor(self, tensor: Tensor) -> None: + is_param = tensor._attrs["is_param"] + is_input = tensor._attrs["is_input"] + view = tensor._attrs["is_view_of"] + is_view = view is not None + external_tensor = tensor._attrs["external_tensor"] + name = tensor._attrs["name"] + + output_idx = self._get_output_idx(name) + + if is_param: + self._codegen_param_setup(tensor) + self._record_param_tensor_info(tensor, output_idx) + self.device_to_device_copies.append(device_copy(tensor, tensor, output_idx)) + elif external_tensor is not None: + # Special view cases for outputs; we can hit this case if the output + # is a view of a constant, input, or another output. 
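+            # (Illustrative sketch, not taken from the source: if an output y
+            #  were declared as, say, a reshape view of an input x, this branch
+            #  aliases y to its view source and emits a device-to-device copy
+            #  from the external tensor into y's output slot, so the
+            #  user-visible output buffer still receives the data.)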
+ assert ( + is_view + ), f"orig_tensor is not None, but node {name} is not marked as a view! Node: {tensor}" + self.set_inputs.append( + check_not_null(tensor, output_idx, skip_if_lower_bound_is_zero=True) + ) + self.set_inputs.append(set_value(name, view._attrs["name"])) + self.device_to_device_copies.append( + device_copy(tensor, external_tensor, output_idx) + ) + self._record_param_tensor_info(tensor, output_idx) + elif is_input: + # Inputs that are also outputs require an extra copy + self.set_inputs.append( + set_value( + name, + f"static_cast(params[{self.input_idx}].ptr)", + ) + ) + self._record_param_tensor_info(tensor, self.input_idx) + self._record_param_tensor_info(tensor, output_idx) + self.device_to_device_copies.append(device_copy(tensor, tensor, output_idx)) + self.input_idx += 1 + else: + self._codegen_output_aliases_tensor(tensor) + + def _process_dims(self, shape: List[IntVar]) -> None: + for dim in shape: + if dim._attrs["name"] in self.visited_dims: + continue + intimm = 0 + if len(dim._attrs["values"]) == 1: + intimm = dim._attrs["values"][0] + self.dim_decl.append(self.f_var_decl(dim._attrs["name"], intimm)) + self.visited_dims.add(dim._attrs["name"]) + + def _process_dims_for_tensor(self, node: Tensor) -> None: + self._process_dims(node._attrs["shape"]) + + def _process_dims_for_tensor_accessors( + self, tensor_accessors: List[TensorAccessor] + ) -> None: + if tensor_accessors is None: + return + for accessor in tensor_accessors: + self._process_dims(accessor.original_shapes) + + def _process_dims_for_op(self, node: Operator) -> None: + self._process_dims_for_tensor_accessors(node._attrs.get("input_accessors")) + self._process_dims_for_tensor_accessors(node._attrs.get("output_accessors")) + + def _process_src_ops(self, node: Tensor) -> None: + funcs = node.src_ops() + for func in funcs: + f_func_decl = registry.get( + ".".join((self.target.name(), func._attrs["op"], "func_decl")) + ) + f_func_call = registry.get( + ".".join((self.target.name(), func._attrs["op"], "func_call")) + ) + if func._attrs["name"] not in self.exist_funcs: + self.func_decl.append(f_func_decl(func._attrs)) + self.exist_funcs.add(func._attrs["name"]) + + # Only code gen func once for ops with multiple outputs + # The func can get renamed during refine_graph pass. + # We use original_name here because it's unique. 
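+            # (Hedged example: a multi-output op such as split shows up in the
+            #  src_ops of every one of its output tensors, so without the
+            #  visited_func guard its kernel call would be emitted once per
+            #  output instead of once per op.)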
+ if func._attrs["original_name"] not in self.visited_func: + self.visited_func.add(func._attrs["original_name"]) + self.func_seq.append(f_func_call(func._attrs, indent=" ")) + if "int_state_flag" in func._attrs: + if func._attrs["name"] not in self.state_record: + self.function_state.append( + f' int64_t {func._attrs["name"]}_state {{0}};' + ) + self.state_record.add(func._attrs["name"]) + self._process_dims_for_op(func) + + def append_tensor(self, node: Tensor) -> None: + if node._attrs["nop"]: + return + name = node._attrs["name"] + dtype = node._attrs["dtype"] + self.tensor_decl.append(self.f_ptr_decl(name=name, dtype=dtype)) + + is_param = node._attrs["is_param"] + is_output = node._attrs["is_output"] + has_output_aliases = node._attrs["has_output_aliases"] + is_input = node._attrs["is_input"] + view = node._attrs["is_view_of"] + is_view = view is not None + + if is_output: + # Outputs have a ton of special cases that depend on + # is_input, is_view, etc, so this condition needs to + # be checked before all the others + self._codegen_output_tensor(node) + elif is_param: + self._codegen_param_setup(node) + elif is_input: + self._codegen_input_tensor(node) + elif has_output_aliases: + # Special case: internal tensor that aliases an output. + self._codegen_output_aliases_tensor(node) + elif not is_view: + # Normal, internal tensor that is not a view: point it to the + # internal blob of memory + assert ( + node._attrs["offset"] >= 0 + ), f"Non-parameter node '{name}' must have non-negative offset" + self.tensor_slice.append(self._tensor_slice_func(node, "blob_ptr")) + else: + # Normal view, point it to the same memory as whatever it + # aliases + self.set_inputs.append(set_value(name, view._attrs["name"])) + + self._process_dims_for_tensor(node) + self._process_src_ops(node) + + def generate_source(self) -> Dict[str, str]: + """ + Perform the codegen after adding all tensors. + The dictionary returned is a map from filename -> contents. 
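+
+        As a sketch of the result (file names taken from the body below), the
+        map contains "device_functions-generated.h", "model-generated.h", and
+        the rendered model container source, keyed by file name.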
+ """ + device_functions_header_name = f"{self.target.name()}_device_functions.h" + result = {} + result[ + "device_functions-generated.h" + ] = f'#include "{device_functions_header_name}"' + + # Disable graph mode on ROCM because the updating operations + # are not supported + target_has_graph_mode = "true" if self.target.name() == "cuda" else "false" + + model_def = MODEL_TEMPLATE.render( + function_decl="\n".join(self.func_decl), + device_functions_header=device_functions_header_name, + set_inputs="\n".join(self.set_inputs), + tensor_slice="\n".join(self.tensor_slice), + tensor_map_set="\n".join(self.tensor_map_set), + set_up_constants="\n".join(self.set_up_constants), + device_to_device_copies="\n".join(self.device_to_device_copies), + set_up_param_dynamic_shapes="\n".join(self.set_up_param_dynamic_shapes), + function_seq=self.func_seq, + tensor_decl="\n".join(self.tensor_decl), + dim_decl="\n".join(self.dim_decl), + function_state="\n".join(self.function_state), + target_has_graph_mode=target_has_graph_mode, + unique_workspace_size=self.workspace.unique_size, + ) + + result["model-generated.h"] = model_def + + model_container_src_fname = f"model_container_base{self.target.src_extension()}" + model_container_base_src = MODEL_CONTAINER_TEMPLATE.render( + blob_size=self.max_blob_size, + workspace_size=self.workspace.total_size(), + num_inputs=self.num_inputs, + num_outputs=self.num_outputs, + param_size=self.max_constant_blob_size, + set_up_constant_names="\n".join(self.set_up_constant_names), + set_up_param_dtypes="\n".join(self.set_up_param_dtypes), + set_up_output_shapes="\n".join(self.set_up_output_shapes), + set_up_param_names="\n".join(self.set_up_param_names), + num_constants=self.num_constants, + num_unbound_constants=self.unbound_constant_idx, + owned_constants_init=",".join(self.owned_constants_init), + ) + result[model_container_src_fname] = model_container_base_src + return result + + +def _construct_output_name_to_index_map( + sorted_graph: List[Tensor], output_tensors: List[Tensor] +) -> Dict[str, int]: + """ + Use the given output ordering to construct a name -> index map + to be used for constructing an internal ordering during codegen. + + The indices in the map are propagated to an output's entire alias set. + If two outputs are part of the same alias set, only one of them propagates + its output index. 
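+
+    A hedged example with made-up tensor names: for output_tensors [y0, y1]
+    the map starts as {"y0": 0, "y1": 1}; if y1 is a view of some tensor t,
+    the alias propagation below assigns index 1 to t as well.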
+ """ + result = {tensor._attrs["name"]: i for i, tensor in enumerate(output_tensors)} + + # Mark alias sets + for tensor in reversed(sorted_graph): + name = tensor._attrs["name"] + orig = tensor._attrs["is_view_of"] + if orig is None: + continue + orig_name = orig._attrs["name"] + if name in result and orig_name not in result: + result[orig_name] = result[name] + + return result + + +def gen_library_src( # noqa: C901 + sorted_graph: list[Tensor], + max_blob_size: int, + max_constant_blob_size: int, + workspace: Workspace, + workdir: str, + output_tensors: List[Tensor], + model_name: str = "", +) -> list[Tuple[str, str]]: + """Generate model driver source code files for the given graph + + Parameters + ---------- + sorted_graph : list[Tensor] + The network after running toposort transformation + max_blob_size : int + Total memory for input/output tensor and intermediate results, + calculated by memory planning transformation + workspace : Workspace + Workspace sizes, computed by memory planning + workdir : str + Target directory for generated C++ source code files + model_name : str, optional + Sub working directory in the workdir for the given model, by default "" + + Returns + ------- + list[Tuple[str, str]] + List of tuple (source file path, object file path) + """ + + def to_obj_name(name: str): + name, _ = os.path.splitext(name) + return f"{name}.obj" + + num_inputs, num_outputs = count_inputs_outputs(sorted_graph) + prefix = os.path.join(workdir, model_name) + constants_fname = os.path.join(prefix, "constants.bin") + constants_data_file = open(constants_fname, "wb") + + output_name_to_index = _construct_output_name_to_index_map( + sorted_graph, output_tensors + ) + + model_container_generator = ModelContainerGenerator( + max_blob_size, + max_constant_blob_size, + workspace, + num_inputs, + num_outputs, + constants_data_file, + output_name_to_index, + ) + for node in sorted_graph: + model_container_generator.append_tensor(node) + constants_data_file.close() + + files = model_container_generator.generate_source() + to_build = [(constants_fname, to_obj_name(constants_fname))] + for fname, contents in files.items(): + fname_full = os.path.join(prefix, fname) + with open(fname_full, "w") as fo: + fo.write(contents) + if not fname_full.endswith(".h"): + to_build.append((fname_full, to_obj_name(fname_full))) + + # Copy over static csrc/headers + sources = model_container_generator.target.copy_headers_and_csrc_to_workdir(prefix) + for fname in sources: + to_build.append((fname, to_obj_name(fname))) + + return to_build diff --git a/python/aitemplate/backend/common/concatenate_common.py b/python/aitemplate/backend/common/concatenate_common.py new file mode 100644 index 000000000..99f24bb03 --- /dev/null +++ b/python/aitemplate/backend/common/concatenate_common.py @@ -0,0 +1,839 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +backend concatenate function common templates. +""" +import jinja2 + +from . 
import tensor_accessor_codegen + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_output_type}} * /*output*/, + {{index_type}} *[] /*output_shape*/, + const {{elem_input_type}} *[] /*inputs*/, + const {{index_type}} *[], /* real_input_shapes, representing shapes of those inputs + whose masks are False, i.e. inputs that will be + copied to the output tensor by concat.*/ + const {{index_type}} *[], /* all_input_shapes, including both kinds of inputs, + i.e. not matter input_mask being True or False */ + const bool [] /*input_masks*/, + const {{index_type}} [] /*concat_dim_sizes*/, + {{index_type}} /*concat_dim*/, + {{index_type}} /*rank*/, + {{index_type}} /*num_real_inputs*/, + {{index_type}} /*num_all_inputs*/, + {{prefix}}Stream_t +); +""" +) + + +KERNEL_SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include + +{{header_src}} + +#ifndef CHECK_ERROR_CAT +#define CHECK_ERROR_CAT(expr) \\ + do { \\ + {{prefix}}Error_t status = (expr); \\ + if (status != {{prefix}}Success) { \\ + auto msg = std::string("Got error: ") + \\ + {{prefix}}GetErrorString(status) + \\ + " at " + __FILE__ + ": " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } while (0) +#endif // CHECK_ERROR_CAT + +#ifndef LAUNCH_CHECK_CAT +#define LAUNCH_CHECK_CAT() CHECK_ERROR_CAT({{prefix}}GetLastError()) +#endif // LAUNCH_CHECK_CAT + +{% if element_func_def %} +{{element_func_def}} +{% endif %} + +namespace { + +{{tensor_accessor_libs}} + +// TODO: support strided tensor with TensorAccessor +// For strided tensor, the index can be much larger than original if the stride is large +bool can_use_32bit_index_math(const int64_t elements, int64_t max_elem=std::numeric_limits::max()) { + if (elements >= max_elem) { + return false; + } + if (elements == 0) { + return max_elem > 0; + } + + return true; +} + +template +struct InputMetaData { + const T *inputs[NumInputs]; /* pointer to each input */ + TensorAccessor input_accessors[NumInputs]; + int64_t concat_dim_offsets[NumInputs]; /* offset of each input along + the concat dimension */ + int64_t concat_dim_values[NumInputs]; /* concat dimension value of + each input */ + int64_t num_elems[NumInputs]; /* number of elements of each input */ +}; + +template <{{index_type}} Rank> +struct OutputMetaData { + int64_t output_shape[Rank]; + int64_t output_strides[Rank]; +}; + +__host__ __device__ __forceinline__ +int64_t get_num_elems(const {{index_type}} *shape, {{index_type}} rank) { + int64_t num = 1; + for ({{index_type}} i = 0; i < rank; i++) { + num *= shape[i]; + } + return num; +} + +template +__host__ __device__ int64_t compute_output_elem_offset( + const int64_t *output_shape, + const int64_t *output_strides, + const INDEX_T input_concat_dim_value, + const INDEX_T concat_dim, + INDEX_T linear_idx) { + INDEX_T offset = 0; + for (INDEX_T i = Rank - 1; i >= 1; --i) { + INDEX_T cur_dim_size = + i == concat_dim ? 
input_concat_dim_value : output_shape[i]; + INDEX_T next_dim_idx = linear_idx / cur_dim_size; + INDEX_T cur_dim_idx = linear_idx - cur_dim_size * next_dim_idx; + INDEX_T cur_dim_offset = cur_dim_idx * static_cast(output_strides[i]); + offset += cur_dim_offset; + linear_idx = next_dim_idx; + } + return offset + linear_idx * static_cast(output_strides[0]); +} +} // namespace + +template +__global__ void +concatenate_kernel( + ELEM_T *orig_output, + OutputMetaData output_meta, + InputMetaData input_meta, + const INDEX_T concat_dim, + const INDEX_T output_concat_dim_stride) { + const INDEX_T tid = blockIdx.x * blockDim.x + threadIdx.x; + const INDEX_T block_y = blockIdx.y % NumInputs; + READ_T* output = reinterpret_cast(orig_output); + + READ_T* input = const_cast( + reinterpret_cast(input_meta.inputs[block_y])); + const TensorAccessor &input_accessor = input_meta.input_accessors[block_y]; + INDEX_T input_offset = input_meta.concat_dim_offsets[block_y]; + INDEX_T num_input_elems = input_meta.num_elems[block_y]; + INDEX_T input_concat_dim_value = input_meta.concat_dim_values[block_y]; + INDEX_T output_offset = input_offset * output_concat_dim_stride; + + constexpr unsigned read_t_sz = sizeof(READ_T); + constexpr unsigned elem_t_sz = sizeof(ELEM_T); + assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0)); + constexpr INDEX_T n_of_elem_t = read_t_sz / elem_t_sz; + // number of READ_T elements per thread + INDEX_T reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t; + const INDEX_T num_elems_in_read_t = num_input_elems / n_of_elem_t; + INDEX_T read_idx = tid; + +#pragma unroll + for (INDEX_T i = 0; i < reads_per_thread_in_read_t; + i++, read_idx += blockDim.x * gridDim.x) { + if (read_idx >= num_elems_in_read_t) { + break; + } + READ_T tmp_v = *(input_accessor.get(input, read_idx)); + /* make sure to adjust read_idx, which refers to location at + (read_idx * n_of_elem_t) actually */ + + INDEX_T output_elem_offset = + compute_output_elem_offset(output_meta.output_shape, + output_meta.output_strides, + input_concat_dim_value, + concat_dim, + read_idx * n_of_elem_t); + {% if element_func %} + output[(output_offset + output_elem_offset) / n_of_elem_t] = {{element_func}}(tmp_v); + {% else %} + output[(output_offset + output_elem_offset) / n_of_elem_t] = tmp_v; + {% endif %} + } +} + +enum class LoadVecType { + VT_HALF = 0, + VT_FLOAT, + VT_FLOAT2, + VT_FLOAT4 +}; + +template +static inline LoadVecType get_vec_type({{index_type}} dim_size) { + {{index_type}} size_elem_t = sizeof(ELEM_T); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + if (sizeof(vec_type) % size_elem_t == 0) { \\ + {{index_type}} n_of_elem_t = sizeof(vec_type) / size_elem_t; \\ + if (dim_size % n_of_elem_t == 0) { \\ + return load_vec_type; \\ + } \\ + } + + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + +#undef HANDLE_ONE_VEC_TYPE + throw std::runtime_error( + "Cannot resolve LoadVecType." 
+ ); +} + +template +void concatenate_kernel_launcher( + ELEM_T *output, + const {{index_type}} *output_shape, + const ELEM_T *inputs[], + const {{index_type}} *real_input_shapes[], + const TensorAccessor *input_accessors[], + const int64_t concat_dim_offsets[], + const {{index_type}} concat_dim, + LoadVecType min_vec_type, + {{prefix}}Stream_t stream) { + + OutputMetaData output_meta; + output_meta.output_strides[Rank - 1] = 1; + output_meta.output_shape[Rank - 1] = output_shape[Rank - 1]; + for (INDEX_T i = Rank - 2; i >= 0; i--) { + output_meta.output_strides[i] = + output_meta.output_strides[i+1] * output_shape[i+1]; + output_meta.output_shape[i] = output_shape[i]; + } + + InputMetaData input_meta; + INDEX_T max_num_input_elems = 0; + for (INDEX_T i = 0; i < NumInputs; i++) { + INDEX_T num_elems = get_num_elems(real_input_shapes[i], Rank); + input_meta.inputs[i] = inputs[i]; + input_meta.input_accessors[i] = *(input_accessors[i]); + input_meta.concat_dim_offsets[i] = concat_dim_offsets[i]; + input_meta.concat_dim_values[i] = real_input_shapes[i][concat_dim]; + input_meta.num_elems[i] = num_elems; + + max_num_input_elems = num_elems > max_num_input_elems ? + num_elems : max_num_input_elems; + } + + constexpr INDEX_T elems_per_block = ThreadsPerBlock * ElemsPerThread; + INDEX_T m = (max_num_input_elems % elems_per_block != 0); + INDEX_T num_blocks_x = + (max_num_input_elems / elems_per_block) + m; + dim3 grid_config = dim3(static_cast(num_blocks_x), NumInputs); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + case load_vec_type: { \\ + if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) { \\ + throw std::runtime_error( \\ + std::string("No valid kernel available for ") + #vec_type); \\ + } \\ + concatenate_kernel \\ + <<>>( \\ + output, \\ + output_meta, \\ + input_meta, \\ + concat_dim, \\ + output_meta.output_strides[concat_dim]); \\ + LAUNCH_CHECK_CAT(); \\ + break; \\ + } + + switch (min_vec_type) { + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + default: + throw std::runtime_error("Invalid LoadVecType\\n"); + } + +#undef HANDLE_ONE_VEC_TYPE +} + +#undef CHECK_ERROR_CAT +#undef LAUNCH_CHECK_CAT +""" +) + + +DUMMY_KERNEL_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include +{{header_src}} + +void {{func_name}}( + {{elem_output_type}} *output, + {{index_type}} *output_shape[], + const {{elem_input_type}} *inputs[], + const {{index_type}} *real_input_shapes[], + const {{index_type}} *all_input_shapes[], + const bool input_masks[], + const {{index_type}} concat_dim_sizes[], + {{index_type}} concat_dim, + {{index_type}} rank, + {{index_type}} num_real_inputs, + {{index_type}} num_all_inputs, + {{prefix}}Stream_t stream + ) { +} +""" +) + + +INPUT_ACCESSOR_DEFS_TEMPLATE = jinja2.Template( + """ +{{input_accessors}} + +{{indent}}const TensorAccessor *input_accessors[{{num_real_inputs}}] = { + +{{indent}} {{input_accessor_refs}} + +{{indent}}}; +""" +) + + +EXEC_COND_TEMPLATE = jinja2.Template( + """ +{{indent}}if (rank == {{rank}} && num_real_inputs == {{num_real_inputs}}) { + +{{input_accessor_defs}} + +{{indent}} LoadVecType min_vec_type = LoadVecType::VT_FLOAT4; +{{indent}} int64_t accessor_idx = 0; +{{indent}} for ({{index_type}} i = 0; i < num_all_inputs; i++) { +{{indent}} int local_alignment; +{{indent}} if (!input_masks[i] || +{{indent}} 
input_accessors[accessor_idx]->stride_dim == -1) { +{{indent}} local_alignment = all_input_shapes[i][rank - 1]; +{{indent}} // int64_t is ok here because this happens on CPU +{{indent}} for (int64_t j = rank - 2; j >= concat_dim; j--) { +{{indent}} local_alignment *= all_input_shapes[i][j]; +{{indent}} } +{{indent}} } else { +{{indent}} local_alignment = +{{indent}} input_accessors[accessor_idx]->max_alignment(); +{{indent}} } +{{indent}} LoadVecType vec_type = get_vec_type<{{elem_type}}>(local_alignment); +{{indent}} min_vec_type = vec_type < min_vec_type ? vec_type : min_vec_type; +{{indent}} if (input_masks[i]) { +{{indent}} accessor_idx++; +{{indent}} } +{{indent}} } + +{{indent}} {{index_type}} local_output_shape[] = { +{% for idx in range(rank - 1) %} +{{indent}} *(output_shape[{{idx}}]), +{% endfor %} +{{indent}} *(output_shape[{{rank - 1}}]) +{{indent}} }; + +{{indent}}/* TODO: more profiling on ElemsPerThread and ThreadsPerBlock */ +{{indent}}if (use_int32_index_math) { +{{indent}} concatenate_kernel_launcher<{{elem_type}}, +{{indent}} int32_t, +{{indent}} {{rank}}/*Rank*/, +{{indent}} {{num_real_inputs}}/*NumInputs*/, +{{indent}} {{elems_per_thread}}/*ElemsPerThread*/, +{{indent}} {{threads_per_block}}/*THREADS_PER_BLOCK*/>( +{{indent}} output, local_output_shape, inputs, real_input_shapes, input_accessors, +{{indent}} concat_dim_offsets.data(), concat_dim, min_vec_type, stream); +{{indent}}} else { +{{indent}} concatenate_kernel_launcher<{{elem_type}}, +{{indent}} int64_t, +{{indent}} {{rank}}/*Rank*/, +{{indent}} {{num_real_inputs}}/*NumInputs*/, +{{indent}} {{elems_per_thread}}/*ElemsPerThread*/, +{{indent}} {{threads_per_block}}/*THREADS_PER_BLOCK*/>( +{{indent}} output, local_output_shape, inputs, real_input_shapes, input_accessors, +{{indent}} concat_dim_offsets.data(), concat_dim, min_vec_type, stream); +{{indent}}} +{{indent}}return; +{{indent}}} +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +{{kernel_src}} + +void {{func_name}}( + {{elem_output_type}} *output, + {{index_type}} *output_shape[], + const {{elem_input_type}} *inputs[], + const {{index_type}} *real_input_shapes[], /* real_input_shapes, representing + shapes of those inputs whose masks are False, + i.e. inputs that will be copied to the output + tensor by concat.*/ + const {{index_type}} *all_input_shapes[], /* all_input_shapes include both + kinds of inputs, i.e. 
no matter input_mask being + True or False */ + const bool input_masks[], + const {{index_type}} concat_dim_sizes[], + {{index_type}} concat_dim, + {{index_type}} rank, + {{index_type}} num_real_inputs, + {{index_type}} num_all_inputs, + {{prefix}}Stream_t stream + ) { + + if (rank <= 0) { + throw std::runtime_error("rank must be larger than 0!"); + } + if (concat_dim >= rank) { + throw std::runtime_error("concat_dim must be smaller than rank!"); + } + if (num_real_inputs < 1) { + throw std::runtime_error("the number of inputs must >= 1!"); + } + + for ({{index_type}} i = 0; i < rank; i++) { + if (i == concat_dim) continue; + {{index_type}} dim = real_input_shapes[0][i]; + for ({{index_type}} j = 1; j < num_real_inputs; j++) { + if (real_input_shapes[j][i] != dim) { + throw std::runtime_error( + "invalid input shape, func_name: {{func_name}}, dim: " + + std::to_string(dim) + ", input_shape: " + + std::to_string(real_input_shapes[j][i]) + ); + } + } + } + + {{index_type}} output_concat_dim_value = 0; + std::vector concat_dim_offsets; + + for ({{index_type}} i = 0; i < num_all_inputs; i++) { + if (input_masks[i]) { + concat_dim_offsets.push_back(output_concat_dim_value); + } + output_concat_dim_value += concat_dim_sizes[i]; + } + for ({{index_type}} i = 0; i < rank; i++) { + if (i == concat_dim) { + *(output_shape[i]) = output_concat_dim_value; + } else { + *(output_shape[i]) = real_input_shapes[0][i]; + } + } + + // If all input tensors are empty we are done + bool empty = false; + bool use_int32_index_math = true; + for (int i = 0; i < num_real_inputs; i++) { + int64_t num_elems = get_num_elems(real_input_shapes[i], rank); + if (get_num_elems(real_input_shapes[i], rank) != 0) { + empty = false; + // make sure input is valid for each non-zero-size tensor + if (!inputs[i]) { + throw std::runtime_error("NULL input is found at: " + std::to_string(i)); + } + } + if (input_masks[i]) { + use_int32_index_math &= can_use_32bit_index_math(num_elems); + } + } + + if (empty) { + return; + } + + // if the output has any zero dim size, we are done + for (int i = 0; i < rank; i++) { + if (*output_shape[i] == 0) + return; + } + // make sure output is valid + if (!output) { + throw std::runtime_error("output is NULL!"); + } + +{{exec_paths}} + + throw std::runtime_error( + "Unsupported concat kernel specialization!" 
+ ); +} +""" +) + + +INPUT_SHAPE_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}}{{index_type}} {{input_shape_name}}[] = { +{{indent}} {{input_dims}} +{{indent}}}; +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ + +{{indent}} const {{input_elem_type}} *inputs[] = { +{{indent}} {{inputs}} +{{indent}} }; + +{{real_input_shape_defs}} + +{{indent}} const {{index_type}} *real_input_shapes[] = { +{{indent}} {{real_input_shapes}} +{{indent}} }; + +{{all_input_shape_defs}} + +{{indent}} const {{index_type}} *all_input_shapes[] = { +{{indent}} {{all_input_shapes}} +{{indent}} }; + +{{indent}} {{index_type}} *{{output}}_shape[] = { +{{indent}} {{output_dim_refs}} +{{indent}} }; + +{{indent}} {{index_type}} concat_dim_sizes[] = { +{{indent}} {{concat_dim_sizes}} +{{indent}} }; + +{{indent}} bool input_masks[] = { +{{indent}} {{input_masks}} +{{indent}} }; + +{{indent}} {{func_name}}( +{{indent}} {{output_ptr}}, +{{indent}} {{output}}_shape, +{{indent}} inputs, +{{indent}} real_input_shapes, +{{indent}} all_input_shapes, +{{indent}} input_masks, +{{indent}} concat_dim_sizes, +{{indent}} {{concat_dim}}/*concat_dim*/, +{{indent}} {{rank}}/*rank*/, +{{indent}} {{num_real_inputs}}/*num_real_inputs*/, +{{indent}} {{num_all_inputs}}/*num_all_inputs*/, +{{indent}} stream +{{indent}} ); +{{indent}}} +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Generate function declaration. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + Returns + ------- + str + Rendered function declaration. + """ + # get dtype from orig_x in case actual "inputs" is turned into empty + # by some transformation + orig_x = func_attrs["original_inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + func_name=func_attrs["name"], + elem_output_type=output_type, + elem_input_type=input_type, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ) + + +def gen_function( + func_attrs, + backend_spec, + element_func=None, + element_func_def=None, +): + """Generates function body. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + index_type: str + Index type. + prefix: str + Backend function prefix, hip/cuda + dtype_to_backend_type: Dict[str, str] + header_src_template: jinja Template + Header src template. + + Returns + ------- + str + Rendered function body. 
+ """ + inputs = func_attrs["inputs"] + original_inputs = func_attrs["original_inputs"] + orig_x = original_inputs[0] + y = func_attrs["outputs"][0] + x_shape = orig_x._attrs["shape"] + + input_type = backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + + # TODO: support type cast + if input_type != output_type: + raise NotImplementedError("input type must equal to output type") + + def _stride(shape, dim): + stride = 1 + for v in shape[dim:]: + stride = stride * v._attrs["values"][0] + return stride + + concat_dim = func_attrs["concat_dim"] + assert concat_dim < len(x_shape) + strides = [_stride(i._attrs["shape"], concat_dim) for i in inputs] + # the max number of elements in each concat loop iteration + elems_per_iter = max(strides) if len(strides) > 0 else 0 + threads_per_block = 128 + # minimal number of elems per thread is 8, max is 480 + elems_per_thread = min(480, (int((elems_per_iter / threads_per_block + 8) / 8) * 8)) + + input_accessors = [] + input_accessor_refs = [] + for i in range(len(inputs)): + accessor_name = f"input_accessor{i}" + input_accessor_refs.append(f"&{accessor_name}") + input_accessors.append( + tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render( + name=accessor_name, tensor_accessor=func_attrs["input_accessors"][i] + ) + ) + input_accessor_defs = INPUT_ACCESSOR_DEFS_TEMPLATE.render( + indent=" ", + input_accessors="".join(input_accessors), + num_real_inputs=len(inputs), + input_accessor_refs=", ".join(input_accessor_refs), + ) + + # TODO: consider to add profiling paths for tuning + # elems_per_thread and threads_per_block + exec_paths = EXEC_COND_TEMPLATE.render( + indent=" ", + rank=len(x_shape), + num_real_inputs=len(inputs), + input_accessor_defs=input_accessor_defs, + elem_type=input_type, + elems_per_thread=elems_per_thread, + threads_per_block=threads_per_block, + index_type=backend_spec.index_type, + ) + + header_src = backend_spec.header_src_template.render() + if len(inputs) > 0: + tensor_accessor_libs = tensor_accessor_codegen.get_libs() + kernel_src = KERNEL_SRC_TEMPLATE.render( + element_func=element_func, + element_func_def=element_func_def, + header_src=header_src, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + tensor_accessor_libs=tensor_accessor_libs, + ) + return SRC_TEMPLATE.render( + kernel_src=kernel_src, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + exec_paths=exec_paths, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ) + + return DUMMY_KERNEL_TEMPLATE.render( + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + header_src=header_src, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ) + + +def gen_function_call( + func_attrs, + backend_spec, + indent=" ", +): + """Generates function call. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + index_type: str + Index type. + cast_to_const_half_ptr_template: jinja template + Cast to const half ptr template. + cast_to_half_ptr_template: jinja template + Cast to half ptr template. + dtype_to_backend_type: Dict[str, str] + Stores python dtype to backend (rocm, cuda) type. + indent : str, optional + Indent for template, by default " ". + + Returns + ------- + str + Rendered function call. 
+ """ + inputs = func_attrs["inputs"] + input_accessors = func_attrs["input_accessors"] + assert len(inputs) == len(input_accessors), ( + "expected inputs and input_accessors to have the same length, but got: " + f'{len(inputs)}, {len(input_accessors)}, op: {func_attrs["name"]}' + ) + original_inputs = func_attrs["original_inputs"] + orig_x = original_inputs[0] + y = func_attrs["outputs"][0] + concat_dim = func_attrs["concat_dim"] + + input_names = ",\n ".join( + [ + backend_spec.cast_to_const_half_ptr_template.render(name=i._attrs["name"]) + for i in inputs + ] + ) + real_input_shape_defs = [] + real_input_shape_names = [] + for idx, (i, input_accessor) in enumerate(zip(inputs, input_accessors)): + input_shape_name = f'{i._attrs["name"]}_shape_{idx}' + orig_input_shape = input_accessor.original_shapes + dims = ", ".join([dim._attrs["name"] for dim in orig_input_shape]) + one_shape_def = INPUT_SHAPE_DEF_TEMPLATE.render( + indent=" ", + input_shape_name=input_shape_name, + input_dims=dims, + index_type=backend_spec.index_type, + ) + real_input_shape_defs.append(one_shape_def) + real_input_shape_names.append(input_shape_name) + + y_shape = y._attrs["shape"] + y_dim_refs = ", ".join(["&" + dim._attrs["name"] for dim in y_shape]) + casted_y_ptr = backend_spec.cast_to_half_ptr_template.render(name=y._attrs["name"]) + + input_masks = func_attrs["input_masks"] + input_indices = [idx for idx, m in enumerate(input_masks) if m is True] + assert len(inputs) == len(input_indices) + concat_dim_sizes = [ + "-1" if mask else str(original_inputs[idx]._attrs["shape"][concat_dim].value()) + for idx, mask in enumerate(input_masks) + ] + + # update dim size for real inputs + for input_accessor, input_index in zip(input_accessors, input_indices): + dim = input_accessor.original_shapes[concat_dim]._attrs["name"] + concat_dim_sizes[input_index] = dim + + input_masks_str = ", ".join( + ["true" if mask is True else "false" for mask in input_masks] + ) + + # all input shape defs and names, including those that are masked out + all_input_shape_defs = [] + all_input_shape_names = [] + # first, create shape defs for inputs that have been masked off + for ( + mask, + orig_input, + ) in zip(input_masks, original_inputs): + if mask is False: + orig_input_shape_name = f'orig_{orig_input._attrs["name"]}_shape' + if orig_input_shape_name not in all_input_shape_names: + dims = ", ".join( + [str(dim._attrs["values"][0]) for dim in orig_input._attrs["shape"]] + ) + one_shape_def = INPUT_SHAPE_DEF_TEMPLATE.render( + indent=" ", + input_shape_name=orig_input_shape_name, + input_dims=dims, + index_type=backend_spec.index_type, + ) + all_input_shape_defs.append(one_shape_def) + all_input_shape_names.append(orig_input_shape_name) + else: + all_input_shape_names.append("") + # update all_input_shapes with real input shapes + for idx, (input_tensor, input_index) in enumerate(zip(inputs, input_indices)): + input_shape_name = f'{input_tensor._attrs["name"]}_shape_{idx}' + all_input_shape_names[input_index] = input_shape_name + + return FUNC_CALL_TEMPLATE.render( + indent=indent, + input_elem_type=backend_spec.dtype_to_backend_type(orig_x._attrs["dtype"]), + inputs=input_names, + real_input_shape_defs="".join(real_input_shape_defs), + real_input_shapes=", ".join(real_input_shape_names), + all_input_shape_defs="".join(all_input_shape_defs), + all_input_shapes=", ".join(all_input_shape_names), + input_masks=input_masks_str, + concat_dim_sizes=", ".join(concat_dim_sizes), + output_dim_refs=y_dim_refs, + func_name=func_attrs["name"], + 
output=y._attrs["name"], + output_ptr=casted_y_ptr, + concat_dim=concat_dim, + rank=len(orig_x._attrs["shape"]), + num_real_inputs=len(inputs), + num_all_inputs=len(original_inputs), + index_type=backend_spec.index_type, + ) diff --git a/python/aitemplate/backend/common/elementwise_common.py b/python/aitemplate/backend/common/elementwise_common.py new file mode 100644 index 000000000..14872058a --- /dev/null +++ b/python/aitemplate/backend/common/elementwise_common.py @@ -0,0 +1,881 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Backend-agnostic functions for elementwise codegen. +""" + +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple + +import jinja2 + +from ...compiler.base import IntImm, IntVar, Operator, Tensor +from ...compiler.tensor_accessor import TensorAccessor +from ...utils import shape_utils +from ..backend_spec import BackendSpec +from . import tensor_accessor_codegen + +CONSTANT_TEMPLATE = jinja2.Template( + """ +#define FUSED_ELE_THREAD_SIZE 256 + +const int N_ELEMENTS_PER_THREAD = sizeof({{read_t}}) / sizeof({{data_t}}); +const int N_ELEMENTS_PER_READ = sizeof({{read_t}}) / sizeof({{data_t}}); +const int N_OPS_PER_THREAD = sizeof({{read_t}}) / sizeof({{op_t}}); + """ +) + +KERNEL_DECL_INPUT_PARAM_TEMPLATE = jinja2.Template("const {{read_t}}* input{{idx}}") +KERNEL_DECL_OUTPUT_PARAM_TEMPLATE = jinja2.Template("{{read_t}}* output{{idx}}") + +KERNEL_TMP_INPUT_TEMPLATE = jinja2.Template("p_tmp_i{{idx}}[i]") +KERNEL_TMP_OUTPUT_TEMPLATE = jinja2.Template("p_tmp_o{{idx}}[i]") + + +GET_STRIDED_ADDRESS_TEMPLATE = jinja2.Template( + """ + {% if tensor_accessor.is_contiguous %} + {{data_ptr}} = get_strided_address( + {{data_ptr}}, {{data_idx}}, {{tensor_accessor.offset}}, 0, 0); + {% else %} + {{data_ptr}} = get_strided_address( + {{data_ptr}}, {{data_idx}}, + {{tensor_accessor.offset}}, + {{tensor_accessor.original_total_elements_from_stride_dim}}, + {{tensor_accessor.actual_total_elements_from_stride_dim}}); + {% endif %} + """ +) + + +KERNEL_READ_INPUT_TEMPLATE = jinja2.Template( + """ + {{read_t}} *{{input_name}} = const_cast<{{read_t}}*>(input{{input_idx}}); + {{get_strided_address}} + {{read_t}} tmp_i{{input_idx}} = *{{input_name}}; + const {{op_t}}* p_tmp_i{{input_idx}} = reinterpret_cast(&tmp_i{{input_idx}}); + + """ +) + + +KERNEL_DEFINE_OUTPUTS_TEMPLATE = jinja2.Template( + """ + {% for idx in indexes %} + {{read_t}} tmp_o{{idx}}; + {{op_t}}* p_tmp_o{{idx}} = reinterpret_cast<{{op_t}}*>(&tmp_o{{idx}}); + {% endfor %} + """ +) + + +KERNEL_WRITE_OUTPUT_TEMPLATE = jinja2.Template( + """ + {{get_strided_address}} + *{{output_name}} = tmp_o{{output_idx}}; + """ +) + + +KERNEL_TEMPLATE = jinja2.Template( + """ +__global__ void +{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} int n_elements) { + const int bid = blockIdx.x; + const int tid = threadIdx.x; + const int idx = bid * FUSED_ELE_THREAD_SIZE + tid; + const int idx_elem = idx * N_ELEMENTS_PER_THREAD; + if (idx_elem 
>= n_elements) { + return; + } + {{read_inputs}} + {{define_outputs}} +#pragma unroll + for (int i = 0; i < N_OPS_PER_THREAD; ++i) { + {{fused_funcs}} + } + {{write_outputs}} +} + """ +) + +FUNC_DECL_INPUT_PARAM_TEMPLATE = jinja2.Template("const {{data_t}}* input{{idx}}") +FUNC_DECL_OUTPUT_PARAM_TEMPLATE = jinja2.Template("{{data_t}}* output{{idx}}") +KERNEL_CALL_INPUT_PARAM_TEMPLATE = jinja2.Template( + "reinterpret_cast(input{{idx}})" +) +KERNEL_CALL_OUTPUT_PARAM_TEMPLATE = jinja2.Template( + "reinterpret_cast<{{read_t}}*>(output{{idx}})" +) + +FUNC_TEMPLATE = jinja2.Template( + """ +{{head}} + +namespace { + +{{constant}} + +{{custom_libs}} + +{{tensor_accessor_lib}} + +{{kernel_function}} + +} // namespace + +void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims_decl}} int n_elements, {{prefix}}Stream_t stream) { + if (n_elements == 0) { + return; + } + int block_size = static_cast(std::ceil(static_cast(n_elements) / N_ELEMENTS_PER_THREAD / FUSED_ELE_THREAD_SIZE)); + {{func_name}}<<>>( + {{kernel_call_output_params}}, + {{kernel_call_input_params}}, + {{dynamic_dims_call}} + n_elements + ); +} + """ +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} int n_elements, {{prefix}}Stream_t stream); + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ + {{indent}}int {{func_name}}_n_elements = {{calculate_n}}; + {{indent}}invoke_{{func_name}}({{output_params}}, {{input_params}}, {{dynamic_dims}} {{func_name}}_n_elements, {{stream}}); +{{indent}}} + """ +) + + +@dataclass +class ElementwiseMetaData: + func_name: str + op_t: str + args: List[Tensor] + outputs: List[Tensor] + + +@dataclass +class FusedElementwiseMetaData: + # Input / output Tensors and TensorAccessors. + inputs: List[Tensor] + outputs: List[Tensor] + input_accessors: List[TensorAccessor] + output_accessors: List[TensorAccessor] + + # Original input / output Tensors before graph transformation. + # Kept here for elementwise -> fused elementwise Tensor mapping. 
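+    # (Illustrative, hypothetical example: if an add and a relu were fused
+    #  into one elementwise kernel, original_inputs would still name the
+    #  tensors the standalone ops consumed, which is what lets results be
+    #  mapped back to the pre-fusion graph.)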
+ original_inputs: List[Tensor] + original_outputs: List[Tensor] + + read_t: str + op_t: str + data_t: str + input_broadcast_sizes: List[List[IntVar]] + dynamic_dims: List[IntVar] + sub_funcs: List[ElementwiseMetaData] + + +def gen_function_single_thread( + fused_func_metadata, + input_names, + output_names, + type_converter, +) -> str: + """Per thread elementwise function codegen.""" + tensor_to_expr: Dict[Tensor, str] = {} + body = "" + + for tensor, name in zip(fused_func_metadata.original_inputs, input_names): + tensor_to_expr[tensor] = name + + tmp_output_idx: int = 0 + for func_metadata in fused_func_metadata.sub_funcs: + params: List[str] = [] + func_op_t = func_metadata.op_t + input_converter = None + output_converter = None + if func_op_t != fused_func_metadata.op_t: + input_converter = type_converter.get(fused_func_metadata.op_t).get( + func_op_t + ) + output_converter = type_converter.get(func_op_t).get( + fused_func_metadata.op_t + ) + assert ( + input_converter is not None + ), "Unsupported convertion from {} to {}".format( + fused_func_metadata.op_t, func_op_t + ) + assert ( + output_converter is not None + ), "Unsupported convertion from {} to {}".format( + func_op_t, fused_func_metadata.op_t + ) + + for arg in func_metadata.args: + if arg in tensor_to_expr: + param = tensor_to_expr[arg] + params.append( + "{}({})".format(input_converter, param) + if input_converter is not None + else param + ) + elif arg.is_a_const_num(): + if func_op_t[-1] == "2": + params.append( + "{}({},{})".format( + func_op_t, + str(arg._attrs["value"]), + str(arg._attrs["value"]), + ) + ) + else: + params.append("{}({})".format(func_op_t, str(arg._attrs["value"]))) + else: + raise RuntimeError( + "Cannot generate expression for node {}, ops: {}".format( + arg, func_metadata + ) + ) + assert ( + len(func_metadata.outputs) == 1 + ), "Operator has more than 1 output! Operator: {}".format(func_metadata) + + output = func_metadata.outputs[0] + func_def = "{}({})".format(func_metadata.func_name, ",".join(params)) + func_def = ( + "{}({})".format(output_converter, func_def) + if output_converter is not None + else func_def + ) + if len(output._attrs["dst_ops"]) > 1: + name = "tmp_" + (str)(tmp_output_idx) + tmp_output_idx += 1 + body += "{} {} = {};\n".format(fused_func_metadata.op_t, name, func_def) + tensor_to_expr[output] = name + else: + tensor_to_expr[output] = func_def + + for tensor, name in zip(fused_func_metadata.original_outputs, output_names): + if tensor not in tensor_to_expr: + raise RuntimeError( + "Cannot generate expression for node {}, outputs: {}".format( + tensor, fused_func_metadata.original_outputs + ) + ) + expr = tensor_to_expr[tensor] + body += "{} = {};\n".format(name, expr) + + return body + + +def _get_sub_func_metadata( + ops: List[Operator], data_t: str, op_t: str, backend_spec: BackendSpec +) -> Tuple[List[ElementwiseMetaData], str]: + candidate_op_types = backend_spec.get_candidate_op_types(op_t) + func_enums = [] + for op in ops: + func_enum = op._attrs["func"] + func_enums.append(func_enum) + funcs = backend_spec.func_enum_to_func_name.get(func_enum) + if funcs is None: + raise NotImplementedError("Func {} is not supported!".format(func_enum)) + for candidate_op_t in candidate_op_types: + func_name = funcs.get(candidate_op_t) + if func_name is not None: + candidate_op_types = backend_spec.get_candidate_op_types(candidate_op_t) + break + if len(candidate_op_types) == 0: + raise RuntimeError( + "Cannot find a common rocm data type! 
candidate_op_types: {}, op_t: {}.".format( + candidate_op_types, op_t + ) + ) + if op_t in set(candidate_op_types): + op_t = candidate_op_types[0] + else: + op_t = data_t + candidate_op_types = backend_spec.get_candidate_op_types(op_t) + + sub_func_metadata = [] + for op in ops: + func_enum = op._attrs["func"] + funcs = backend_spec.func_enum_to_func_name.get(func_enum) + func_name = None + func_op_t = None + for candidate_op_t in candidate_op_types: + func_name = funcs.get(candidate_op_t) + if func_name is not None: + func_op_t = candidate_op_t + break + if func_name is None: + raise NotImplementedError( + "Unsupported func {} and op type {}!".format(func_enum, op_t) + ) + sub_func_metadata.append( + ElementwiseMetaData( + func_name, func_op_t, op._attrs["args"], op._attrs["outputs"] + ) + ) + return (sub_func_metadata, op_t) + + +def _get_types_and_sizes( + inputs: List[Tensor], + input_accessors: List[TensorAccessor], + output_accessors: List[TensorAccessor], + backend_spec: BackendSpec, +) -> Tuple[int, List[List[IntVar]], str]: + """ + Returns Tuple(alignment, input_broadcast_sizes, dtype) + """ + + # Handle input broadcast. + output_shape = output_accessors[0].original_shapes + dtype = "float16" + input_broadcast_sizes = [] + min_num_elements = None + for input_tensor, input_accessor in zip(inputs, input_accessors): + if input_tensor._attrs["dtype"] != "float16": + raise NotImplementedError( + "Unsupported dtype {}!".format(input_tensor._attrs["dtype"]) + ) + input_shape = input_accessor.original_shapes + broadcastable, _ = shape_utils.get_broadcast_max_shape( + output_shape, input_shape + ) + if not broadcastable: + raise RuntimeError( + "Input shape {} is not compatible with output shape {}!".format( + input_shape, output_shape + ) + ) + num_rightmost_non_broadcast_elements = len(input_shape) + extended_input_shape = list(input_shape) + if input_shape == output_shape: + input_broadcast_sizes.append(None) + else: + extended_input_shape = [IntImm(1)] * len(output_shape) + extended_input_shape[len(output_shape) - len(input_shape) :] = input_shape + input_broadcast_sizes.append(extended_input_shape) + for i in reversed(range(len(extended_input_shape))): + if extended_input_shape[i] != output_shape[i]: + num_rightmost_non_broadcast_elements -= i + 1 + break + num_elements_for_alignments = shape_utils.get_num_rightmost_static_elements( + extended_input_shape, num_rightmost_non_broadcast_elements + ) + if not min_num_elements: + min_num_elements = num_elements_for_alignments + else: + min_num_elements = min(min_num_elements, num_elements_for_alignments) + alignment = tensor_accessor_codegen.find_max_alignment( + min_num_elements, output_accessors + ) + # Note that we use the same alignment for accessing inputs and outputs, although + # they may have different alignment requirements. We may lose perf a little bit, + # but reduce the complexity of our jinja template. We can do some perf + # experiments later to determine if we want to chase more perf gains. 
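As a rough illustration of the alignment selection described in the comment above (a sketch, not the real tensor_accessor_codegen.find_max_alignment; the candidate widths in elements are an assumption matching common CUDA vector sizes):

def max_alignment(num_contiguous_elems: int, candidates=(8, 4, 2, 1)) -> int:
    # Pick the widest vectorized access whose element count evenly divides the
    # number of rightmost static, non-broadcast elements shared by all accessors.
    for width in candidates:
        if num_contiguous_elems % width == 0:
            return width
    return 1

# An accessor whose innermost static extent is 6 halfs can only be read
# 2 halfs (4 bytes) at a time, even though 8-wide reads would be preferred.
print(max_alignment(6))  # 2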
+ alignment = tensor_accessor_codegen.find_max_alignment(alignment, input_accessors) + return alignment, input_broadcast_sizes, dtype + + +def _get_dynamic_dims(output_accessors: List[TensorAccessor]) -> List[IntVar]: + res = {} + for output_accessor in output_accessors: + for dim in output_accessor.original_shapes: + if not isinstance(dim, IntImm): + res[dim._attrs["name"]] = dim + return res.values() + + +def _parse_func_metadata( + ops: List[Operator], + inputs: List[Tensor], + outputs: List[Tensor], + input_accessors: List[TensorAccessor], + output_accessors: List[TensorAccessor], + original_inputs: List[Tensor], + original_outputs: List[Tensor], + backend_spec: BackendSpec, +) -> FusedElementwiseMetaData: + alignment, input_broadcast_sizes, dtype = _get_types_and_sizes( + inputs, input_accessors, output_accessors, backend_spec + ) + read_type = backend_spec.get_backend_type( + alignment, dtype, backend_spec.read_num_elements_to_backend_type + ) + op_type = backend_spec.get_backend_type( + alignment, dtype, backend_spec.op_num_elements_to_backend_type + ) + data_type = backend_spec.get_fp16_dtype(dtype) + sub_func_metadata, op_type = _get_sub_func_metadata( + ops, data_type, op_type, backend_spec + ) + dynamic_dims = _get_dynamic_dims(output_accessors) + + return FusedElementwiseMetaData( + inputs, + outputs, + input_accessors, + output_accessors, + original_inputs, + original_outputs, + read_type, + op_type, + data_type, + input_broadcast_sizes, + dynamic_dims, + sub_func_metadata, + ) + + +def _gen_int_var_product_str( + int_vars: List[IntVar], +) -> str: + res = [] + for int_var in int_vars: + if isinstance(int_var, IntImm): + res.append(str(int_var._attrs["values"][0])) + elif isinstance(int_var, IntVar): + res.append(int_var._attrs["name"]) + else: + raise RuntimeError( + "A dim must be an IntVar! Current type: {}".format(type(int_var)) + ) + return " * ".join(res) + + +def _gen_input_broadcast_calculator_str( + input_shape: List[IntVar], + output_shape: List[IntVar], +) -> str: + output_num_elements = [] + output_strides = [] + input_strides = [] + + start_idx = 0 + for i, (input_dim, output_dim) in enumerate(zip(input_shape, output_shape)): + if input_dim != output_dim: + assert input_dim == IntImm( + 1 + ), "Unexpected shapes! 
Input: {}, output: {}".format( + input_shape, output_shape + ) + input_strides.append(input_shape[i:]) + output_strides.append(output_shape[i:]) + output_num_elements.append(output_shape[start_idx:]) + start_idx = i + 1 + if start_idx < len(output_shape): + input_strides.append([IntImm(1)]) + output_strides.append([IntImm(1)]) + output_num_elements.append(output_shape[start_idx:]) + + res = [] + for (output_num_element, output_stride, input_stride) in zip( + output_num_elements, output_strides, input_strides + ): + res.append( + "{} % ({}) / ({}) * ({})".format( + "idx * N_ELEMENTS_PER_THREAD", + _gen_int_var_product_str(output_num_element), + _gen_int_var_product_str(output_stride), + _gen_int_var_product_str(input_stride), + ) + ) + + return " + ".join(res) + + +def _gen_input_broadcast_size_str( + input_broadcast_sizes: List[List[IntVar]], + output_shape: List[IntVar], +) -> List[str]: + res = [] + for input_broadcast_size in input_broadcast_sizes: + if input_broadcast_size is None: + res.append("") + else: + res.append( + _gen_input_broadcast_calculator_str(input_broadcast_size, output_shape) + ) + return res + + +def _gen_dynamic_dim_str( + index_type: str, dynamic_dims: List[IntVar], has_type: bool +) -> str: + type_str = index_type + " " if has_type else "" + res = ", ".join([type_str + dim._attrs["name"] for dim in dynamic_dims]) + if res: + res += ", " + return res + + +def _gen_read_inputs_str( + fused_elementwise_metadata: FusedElementwiseMetaData, broadcast_sizes: List[str] +): + read_inputs = [] + for input_idx, (input_accessor, broadcast_size) in enumerate( + zip(fused_elementwise_metadata.input_accessors, broadcast_sizes) + ): + input_name = f"input_tmp{input_idx}" + data_idx = ( + "idx" + if not broadcast_size + else f"({broadcast_size}) / N_ELEMENTS_PER_THREAD" + ) + get_strided_addr_str = GET_STRIDED_ADDRESS_TEMPLATE.render( + tensor_accessor=input_accessor, + data_ptr=input_name, + data_t=fused_elementwise_metadata.data_t, + read_t=fused_elementwise_metadata.read_t, + data_idx=data_idx, + ) + read_input = KERNEL_READ_INPUT_TEMPLATE.render( + get_strided_address=get_strided_addr_str, + input_name=input_name, + input_idx=input_idx, + read_t=fused_elementwise_metadata.read_t, + op_t=fused_elementwise_metadata.op_t, + ) + read_inputs.append(read_input) + read_inputs_str = "\n".join(read_inputs) + return read_inputs_str + + +def _gen_write_outputs_str(fused_elementwise_metadata: FusedElementwiseMetaData): + write_outputs = [] + for output_idx, output_accessor in enumerate( + fused_elementwise_metadata.output_accessors + ): + output_name = f"output{output_idx}" + get_strided_addr_str = GET_STRIDED_ADDRESS_TEMPLATE.render( + tensor_accessor=output_accessor, + data_ptr=output_name, + data_t=fused_elementwise_metadata.data_t, + read_t=fused_elementwise_metadata.read_t, + data_idx="idx", + ) + write_out = KERNEL_WRITE_OUTPUT_TEMPLATE.render( + get_strided_address=get_strided_addr_str, + output_name=output_name, + output_idx=output_idx, + ) + write_outputs.append(write_out) + write_outputs_str = "\n".join(write_outputs) + return write_outputs_str + + +def _gen_kernel_function( + func_attrs: Dict[str, Any], + index_type: str, + fused_elementwise_metadata: FusedElementwiseMetaData, + backend_datatype_convertors: Dict[str, Dict[str, str]], +) -> str: + output_params_decl = ",".join( + [ + KERNEL_DECL_OUTPUT_PARAM_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.outputs) + ] + ) + input_params_decl = ",".join( + [ 
+ KERNEL_DECL_INPUT_PARAM_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.inputs) + ] + ) + + broadcast_sizes = _gen_input_broadcast_size_str( + fused_elementwise_metadata.input_broadcast_sizes, + fused_elementwise_metadata.output_accessors[0].original_shapes, + ) + read_inputs_str = _gen_read_inputs_str(fused_elementwise_metadata, broadcast_sizes) + + define_outputs = KERNEL_DEFINE_OUTPUTS_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, + op_t=fused_elementwise_metadata.op_t, + indexes=list(range(len(fused_elementwise_metadata.outputs))), + ) + write_outputs_str = _gen_write_outputs_str(fused_elementwise_metadata) + + input_names = [ + KERNEL_TMP_INPUT_TEMPLATE.render(idx=i) + for i, _ in enumerate(fused_elementwise_metadata.inputs) + ] + output_names = [ + KERNEL_TMP_OUTPUT_TEMPLATE.render(idx=i) + for i, _ in enumerate(fused_elementwise_metadata.outputs) + ] + fused_funcs = gen_function_single_thread( + fused_elementwise_metadata, + input_names, + output_names, + backend_datatype_convertors, + ) + + kernel_func = KERNEL_TEMPLATE.render( + func_name=func_attrs["name"], + output_params=output_params_decl, + input_params=input_params_decl, + dynamic_dims=_gen_dynamic_dim_str( + index_type, fused_elementwise_metadata.dynamic_dims, has_type=True + ), + read_inputs=read_inputs_str, + define_outputs=define_outputs, + write_outputs=write_outputs_str, + fused_funcs=fused_funcs, + ) + return kernel_func + + +def fused_elementwise_gen_function( + func_attrs: Dict[str, Any], + custom_libs: str, + head_template: str, + backend_spec: BackendSpec, +) -> str: + """Generates fused_elementwise function definition.""" + + ops = func_attrs["elementwise_ops"] + inputs = func_attrs["inputs"] + outputs = func_attrs["outputs"] + input_accessors = func_attrs["input_accessors"] + output_accessors = func_attrs["output_accessors"] + original_inputs = func_attrs["original_inputs"] + original_outputs = func_attrs["original_outputs"] + fused_elementwise_metadata = _parse_func_metadata( + ops, + inputs, + outputs, + input_accessors, + output_accessors, + original_inputs, + original_outputs, + backend_spec, + ) + # Dump data types into func_attr for testing purpose. 
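For intuition about the fused_funcs body rendered above by gen_function_single_thread: for a hypothetical fused y = tanh(a + b) running with op_t == "half2", the generated per-lane statement would look roughly like the string below. The intrinsic names are placeholders; the real ones come from backend_spec.func_enum_to_func_name.

# Hypothetical example of the rendered per-thread body (function names are stand-ins).
example_fused_body = "p_tmp_o0[i] = fast_tanh(__hadd2(p_tmp_i0[i], p_tmp_i1[i]));"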
+ func_attrs["read_t"] = fused_elementwise_metadata.read_t + func_attrs["op_t"] = fused_elementwise_metadata.op_t + func_attrs["data_t"] = fused_elementwise_metadata.data_t + + tensor_accessor_lib = tensor_accessor_codegen.get_libs() + tensor_accessor_lib_str = "\n\n" + tensor_accessor_lib + "\n\n" + + kernel_function = _gen_kernel_function( + func_attrs, + backend_spec.index_type, + fused_elementwise_metadata, + backend_spec.backend_datatype_convertors, + ) + output_params_decl = ",".join( + [ + FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render( + data_t=fused_elementwise_metadata.data_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.outputs) + ] + ) + input_params_decl = ",".join( + [ + FUNC_DECL_INPUT_PARAM_TEMPLATE.render( + data_t=fused_elementwise_metadata.data_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.inputs) + ] + ) + kernel_call_output_params = ",".join( + [ + KERNEL_CALL_OUTPUT_PARAM_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.outputs) + ] + ) + kernel_call_input_params = ",".join( + [ + KERNEL_CALL_INPUT_PARAM_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.inputs) + ] + ) + constant = CONSTANT_TEMPLATE.render( + read_t=fused_elementwise_metadata.read_t, + op_t=fused_elementwise_metadata.op_t, + data_t=fused_elementwise_metadata.data_t, + ) + + function = FUNC_TEMPLATE.render( + prefix=backend_spec.prefix, + head=backend_spec.header_src_template.render(extra_header=head_template), + constant=constant, + custom_libs=custom_libs, + tensor_accessor_lib=tensor_accessor_lib_str, + kernel_function=kernel_function, + func_name=func_attrs["name"], + output_params=output_params_decl, + input_params=input_params_decl, + dynamic_dims_decl=_gen_dynamic_dim_str( + backend_spec.index_type, + fused_elementwise_metadata.dynamic_dims, + has_type=True, + ), + dynamic_dims_call=_gen_dynamic_dim_str( + backend_spec.index_type, + fused_elementwise_metadata.dynamic_dims, + has_type=False, + ), + kernel_call_output_params=kernel_call_output_params, + kernel_call_input_params=kernel_call_input_params, + ) + return function + + +def fused_elementwise_gen_function_decl( + func_attrs, + backend_spec: BackendSpec, +): + """Generates fused_elementwise function declaration.""" + + func_name = func_attrs["name"] + ops = func_attrs["elementwise_ops"] + inputs = func_attrs["inputs"] + outputs = func_attrs["outputs"] + input_accessors = func_attrs["input_accessors"] + output_accessors = func_attrs["output_accessors"] + original_inputs = func_attrs["original_inputs"] + original_outputs = func_attrs["original_outputs"] + fused_elementwise_metadata = _parse_func_metadata( + ops, + inputs, + outputs, + input_accessors, + output_accessors, + original_inputs, + original_outputs, + backend_spec, + ) + output_params_decl = ",".join( + [ + FUNC_DECL_OUTPUT_PARAM_TEMPLATE.render( + data_t=fused_elementwise_metadata.data_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.outputs) + ] + ) + input_params_decl = ",".join( + [ + FUNC_DECL_INPUT_PARAM_TEMPLATE.render( + data_t=fused_elementwise_metadata.data_t, idx=i + ) + for i, _ in enumerate(fused_elementwise_metadata.inputs) + ] + ) + + function_decl = FUNC_DECL_TEMPLATE.render( + prefix=backend_spec.prefix, + func_name=func_name, + output_params=output_params_decl, + input_params=input_params_decl, + dynamic_dims=_gen_dynamic_dim_str( + backend_spec.index_type, + 
fused_elementwise_metadata.dynamic_dims, + has_type=True, + ), + ) + return function_decl + + +def fused_elementwise_gen_function_call( + func_attrs, + indent: str, + backend_spec: BackendSpec, +): + """Generates fused_elementwise function call.""" + ops = func_attrs["elementwise_ops"] + inputs = func_attrs["inputs"] + outputs = func_attrs["outputs"] + input_accessors = func_attrs["input_accessors"] + output_accessors = func_attrs["output_accessors"] + original_inputs = func_attrs["original_inputs"] + original_outputs = func_attrs["original_outputs"] + fused_elementwise_metadata = _parse_func_metadata( + ops, + inputs, + outputs, + input_accessors, + output_accessors, + original_inputs, + original_outputs, + backend_spec, + ) + + output_params_vec = [] + for output in outputs: + if output._attrs["dtype"] != "float16": + raise NotImplementedError( + "Unsupported dtype {}".format(output._attrs["dtype"]) + ) + output_params_vec.append( + backend_spec.cast_to_half_ptr_template.render(name=output._attrs["name"]) + ) + output_params = ",".join(output_params_vec) + + input_params_vec = [] + for inp in inputs: + if inp._attrs["dtype"] != "float16": + raise NotImplementedError( + "Unsupported dtype {}".format(inp._attrs["dtype"]) + ) + input_params_vec.append( + backend_spec.cast_to_half_ptr_template.render(name=inp._attrs["name"]) + ) + input_params = ",".join(input_params_vec) + + num_elements_calculator = _gen_int_var_product_str( + output_accessors[0].original_shapes + ) + + return FUNC_CALL_TEMPLATE.render( + stream=backend_spec.stream, + func_name=func_attrs["name"], + calculate_n=num_elements_calculator, + output_params=output_params, + input_params=input_params, + dynamic_dims=_gen_dynamic_dim_str( + backend_spec.index_type, + fused_elementwise_metadata.dynamic_dims, + has_type=False, + ), + indent=indent, + ) diff --git a/python/aitemplate/backend/common/gemm_common.py b/python/aitemplate/backend/common/gemm_common.py new file mode 100644 index 000000000..eb7bad8b4 --- /dev/null +++ b/python/aitemplate/backend/common/gemm_common.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Backend-agnostic functions for gemm codegen. 
+""" + +from typing import Dict + +import jinja2 + +from aitemplate.compiler.ops.gemm_universal.gemm_common import DimInfo, Source + +SHAPE_EVAL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{dtype}} {{name}} = {{dim_calculator}}; +""" +) + + +def gen_dim_calculator(dim_info: DimInfo, is_ptr: bool) -> str: + prefix = "*" if is_ptr else "" + if dim_info.source == Source.INPUT: + if dim_info.tensor_idx == 0: + prefix += "a_dim" + else: + assert dim_info.tensor_idx == 1, f"Unsupported gemm dim: {dim_info}" + prefix += "b_dim" + else: + assert ( + dim_info.source == Source.OUTPUT and dim_info.tensor_idx == 0 + ), f"Unsupported gemm dim: {dim_info}" + prefix += "c_dim" + dim_names = ["(" + prefix + str(idx) + ")" for idx in dim_info.dim_idx] + return " * ".join(dim_names) + + +def gen_shape_eval_code( + indent: int, dtype: str, dim_info_dict: Dict[str, DimInfo], is_ptr: bool +) -> str: + shape_eval_list = [] + for name, dim_info_list in dim_info_dict.items(): + dim_info = None + for d in dim_info_list: + if d.placeholder: + continue + + dim_info = d + break + assert dim_info is not None, f"Couldn't find valid dim info for dim {name}" + + shape_eval_list.append( + SHAPE_EVAL_TEMPLATE.render( + dtype=dtype, + indent=" " * indent, + name=name, + dim_calculator=gen_dim_calculator(dim_info, is_ptr), + ) + ) + return "\n".join(shape_eval_list) diff --git a/python/aitemplate/backend/common/split_common.py b/python/aitemplate/backend/common/split_common.py new file mode 100644 index 000000000..9205c90ee --- /dev/null +++ b/python/aitemplate/backend/common/split_common.py @@ -0,0 +1,569 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Backend-agnostic function templates for split. 
+""" +import jinja2 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_output_type}} *[] /*outputs*/, + {{index_type}} **[] /*output_shapes*/, + const {{elem_input_type}} * /*input*/, + const {{index_type}} * /*input_shape*/, + {{index_type}} /*num_splits*/, + {{index_type}} [] /*split_sizes*/, + {{index_type}} /*split_dim*/, + {{index_type}} /*rank*/, + {{prefix}}Stream_t stream +); +""" +) + + +KERNEL_SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include + +{{header_src}} + +#ifndef CHECK_ERROR_SPLIT +#define CHECK_ERROR_SPLIT(expr) \\ + do { \\ + {{prefix}}Error_t status = (expr); \\ + if (status != {{prefix}}Success) { \\ + auto msg = std::string("Got error: ") + \\ + {{prefix}}GetErrorString(status) + \\ + " at " + __FILE__ + ": " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } while (0) +#endif // CHECK_ERROR_SPLIT + +#ifndef LAUNCH_CHECK_SPLIT +#define LAUNCH_CHECK_SPLIT() CHECK_ERROR_SPLIT({{prefix}}GetLastError()) +#endif // LAUNCH_CHECK_SPLIT + +template +struct OutputMetaData { + T* outputs[NumSplits]; /* pointer to each output */ + int64_t split_dim_offsets[NumSplits]; /* offset of each output along + the split dimension */ + int64_t split_dim_sizes[NumSplits]; /* cat dimension size of each output */ + int64_t num_elems[NumSplits]; /* number of the elements of each output */ +}; + +template <{{index_type}} Rank> +struct InputMetaData { + {{index_type}} input_shape[Rank]; + int64_t input_strides[Rank]; +}; + +__host__ __device__ __forceinline__ +int64_t get_num_elems(const {{index_type}} *shape, {{index_type}} rank) { + {{index_type}} num = 1; + for ({{index_type}} i = 0; i < rank; i++) { + num *= shape[i]; + } + return num; +} + +template <{{index_type}} Rank> +__host__ __device__ int64_t compute_input_elem_offset( + const {{index_type}} *input_shape, + int64_t *input_strides, + int64_t split_dim_size, + {{index_type}} split_dim, + int64_t linear_idx) { + int64_t offset = 0; + for ({{index_type}} i = Rank - 1; i >= 1; --i) { + int64_t cur_dim_size = i == split_dim ? 
split_dim_size : input_shape[i]; + int64_t next_dim_idx = linear_idx / cur_dim_size; + int64_t cur_dim_idx = linear_idx - cur_dim_size * next_dim_idx; + int64_t cur_dim_offset = cur_dim_idx * input_strides[i]; + offset += cur_dim_offset; + linear_idx = next_dim_idx; + } + return offset + linear_idx * input_strides[0]; +} + +template +__global__ void +split_kernel( + const ELEM_T *orig_input, + InputMetaData input_meta, + OutputMetaData output_meta, + const {{index_type}} split_dim, + const int64_t input_split_dim_stride) { + // split is the inverse of concat, so we + // (1) use blockIdx.y to specify the blocks for each ouput; and + // (2) use tid to access each output; + const {{index_type}} tid = blockIdx.x * blockDim.x + threadIdx.x; + const READ_T* input = reinterpret_cast(orig_input); + + READ_T* output = + reinterpret_cast(output_meta.outputs[blockIdx.y]); + int64_t output_offset = output_meta.split_dim_offsets[blockIdx.y]; + int64_t num_output_elems = output_meta.num_elems[blockIdx.y]; + int64_t split_dim_size = output_meta.split_dim_sizes[blockIdx.y]; + int64_t input_offset = output_offset * input_split_dim_stride; + + unsigned read_t_sz = sizeof(READ_T); + unsigned elem_t_sz = sizeof(ELEM_T); + assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0)); + {{index_type}} n_of_elem_t = read_t_sz / elem_t_sz; + // number of READ_T elements per thread + {{index_type}} reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t; + const {{index_type}} num_elems_in_read_t = num_output_elems / n_of_elem_t; + {{index_type}} read_idx = tid; + +#pragma unroll + for ({{index_type}} i = 0; i < reads_per_thread_in_read_t; + i++, read_idx += blockDim.x * gridDim.x) { + if (read_idx >= num_elems_in_read_t) { + break; + } + /* make sure to adjust read_idx, which refers to location at + (read_idx * n_of_elem_t) actually */ + int64_t input_elem_offset = + compute_input_elem_offset(input_meta.input_shape, + input_meta.input_strides, + split_dim_size, + split_dim, + read_idx * n_of_elem_t); + + READ_T tmp_v = input[(input_offset + input_elem_offset) / n_of_elem_t]; + output[read_idx] = tmp_v; + } +} + +enum class LoadVecType { + VT_HALF = 0, + VT_FLOAT, + VT_FLOAT2, + VT_FLOAT4 +}; + +template +static inline LoadVecType get_vec_type( + const {{index_type}} *shape, {{index_type}} rank, {{index_type}} dim) { + assert(rank > 0); + assert(dim < rank && dim >= 0); + int64_t running_stride = shape[rank - 1]; + for ({{index_type}} i = rank - 2; i >= dim; i--) { + running_stride *= shape[i]; + } + {{index_type}} size_elem_t = sizeof(ELEM_T); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + if (sizeof(vec_type) % size_elem_t == 0) { \\ + {{index_type}} n_of_elem_t = sizeof(vec_type) / size_elem_t; \\ + if (running_stride % n_of_elem_t == 0) { \\ + return load_vec_type; \\ + } \\ + } + + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + +#undef HANDLE_ONE_VEC_TYPE + throw std::runtime_error( + "Cannot resolve LoadVecType." 
+ ); +} + +template +void split_kernel_launcher( + ELEM_T *outputs[], + {{index_type}} *output_shapes[], + const ELEM_T *input, + const {{index_type}} *input_shape, + const {{index_type}} split_dim, + {{prefix}}Stream_t stream +) { + + InputMetaData input_meta; + input_meta.input_strides[Rank - 1] = 1; + input_meta.input_shape[Rank - 1] = input_shape[Rank - 1]; + for ({{index_type}} i = Rank - 2; i >= 0; i--) { + input_meta.input_strides[i] = + input_meta.input_strides[i+1] * input_shape[i+1]; + input_meta.input_shape[i] = input_shape[i]; + } + + OutputMetaData output_meta; + {{index_type}} offset = 0; + LoadVecType min_vec_type = LoadVecType::VT_FLOAT4; + for ({{index_type}} i = 0; i < NumSplits; i++) { + output_meta.outputs[i] = outputs[i]; + output_meta.split_dim_offsets[i] = offset; + output_meta.split_dim_sizes[i] = output_shapes[i][split_dim]; + output_meta.num_elems[i] = get_num_elems(output_shapes[i], Rank); + offset += output_meta.split_dim_sizes[i]; + LoadVecType vec_type = + get_vec_type(output_shapes[i], Rank, split_dim); + min_vec_type = vec_type < min_vec_type ? vec_type : min_vec_type; + } + + int64_t max_num_output_elems = 0; + for ({{index_type}} i = 0; i < NumSplits; i++) { + {{index_type}} num_outputs = get_num_elems(output_shapes[i], Rank); + max_num_output_elems = num_outputs > max_num_output_elems ? + num_outputs : max_num_output_elems; + } + {{index_type}} m = (max_num_output_elems % (ThreadsPerBlock * ElemsPerThread) != 0); + {{index_type}} num_blocks_x = + (max_num_output_elems / (ThreadsPerBlock * ElemsPerThread)) + m; + dim3 grid_config = dim3(num_blocks_x, NumSplits); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + case load_vec_type: { \\ + if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) { \\ + throw std::runtime_error( \\ + std::string("No valid kernel available for ") + #vec_type); \\ + } \\ + split_kernel \\ + <<>>( \\ + input, \\ + input_meta, \\ + output_meta, \\ + split_dim, \\ + input_meta.input_strides[split_dim]); \\ + LAUNCH_CHECK_SPLIT(); \\ + break; \\ + } + + switch (min_vec_type) { + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + default: + throw std::runtime_error("Invalid LoadVecType\\n"); + } + +#undef HANDLE_ONE_VEC_TYPE +} + +#undef CHECK_ERROR_SPLIT +#undef LAUNCH_CHECK_SPLIT + +""" +) + + +EXEC_COND_TEMPLATE = jinja2.Template( + """ +{{indent}}if (rank == {{rank}} && num_splits == {{num_splits}}) { +{% for split_idx in range(num_splits) %} +{{indent}} {{index_type}} local_shape{{split_idx}}[{{rank}}]; +{% for rank_idx in range(rank) %} +{{indent}} local_shape{{split_idx}}[{{rank_idx}}] = input_shape[{{rank_idx}}]; +{% endfor %} +{{indent}} local_shape{{split_idx}}[split_dim] = split_sizes[{{split_idx}}]; + +{% endfor %} + +{{indent}} {{index_type}}* local_output_shapes[{{num_splits}}] = { +{% for idx in range(num_splits - 1) %} +{{indent}} local_shape{{idx}}, +{% endfor %} +{{indent}} local_shape{{num_splits - 1}} +{{indent}} }; +{{indent}} /* TODO: more profiling on ElemsPerThread and ThreadsPerBlock */ +{{indent}} split_kernel_launcher<{{elem_type}}, +{{indent}} {{rank}}/*Rank*/, +{{indent}} {{num_splits}}/*NumSplits*/, +{{indent}} {{elems_per_thread}}/*ElemsPerThread*/, +{{indent}} {{threads_per_block}}/*THREADS_PER_BLOCK*/>( +{{indent}} outputs, local_output_shapes, input, input_shape, split_dim, stream); +{{indent}} return; +{{indent}}} +""" +) + + 
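For intuition on the launcher and the exec path above: splitting a [4, 6, 8] input along split_dim = 1 with split_sizes (2, 4) must yield outputs shaped [4, 2, 8] and [4, 4, 8], and the generated function checks that the split sizes sum back to the input's split-dim extent. A minimal Python sketch of that bookkeeping:

def split_output_shapes(input_shape, split_dim, split_sizes):
    # Mirrors the shape update and sum check performed by the generated code.
    assert sum(split_sizes) == input_shape[split_dim], "unmatched split dim size"
    shapes = []
    for size in split_sizes:
        shape = list(input_shape)
        shape[split_dim] = size
        shapes.append(shape)
    return shapes

print(split_output_shapes([4, 6, 8], 1, [2, 4]))  # [[4, 2, 8], [4, 4, 8]]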
+SRC_TEMPLATE = jinja2.Template( + """ +{{kernel_src}} +void {{func_name}}( + {{elem_output_type}}* outputs[], + {{index_type}} **output_shapes[], + const {{elem_input_type}}* input, + const {{index_type}} *input_shape, + {{index_type}} num_splits, + {{index_type}} split_sizes[], + {{index_type}} split_dim, + {{index_type}} rank, + {{prefix}}Stream_t stream + ) { + + if (rank <= 0) { + throw std::runtime_error("rank must be larger than 0!"); + } + if (split_dim >= rank) { + throw std::runtime_error("cat_dim must be smaller than rank!"); + } + if (num_splits < 1) { + throw std::runtime_error("the number of splits must be larger than 0!"); + } + + // now we update the shape for each output + for ({{index_type}} i = 0; i < num_splits; i++) { + {{index_type}} **shape_ptr = output_shapes[i]; + for ({{index_type}} dim_idx = 0; dim_idx < rank; dim_idx++) { + *(shape_ptr[dim_idx]) = input_shape[dim_idx]; + } + // update dim size for the split axis + *(shape_ptr[split_dim]) = split_sizes[i]; + } + + {{index_type}} split_dim_size = input_shape[split_dim]; + {{index_type}} sum_of_split_sizes = 0; + for ({{index_type}} i = 0; i < num_splits; i++) { + sum_of_split_sizes += split_sizes[i]; + } + if (split_dim_size != sum_of_split_sizes) { + throw std::runtime_error("unmatched split dim size!"); + } + + // If split dim is zero, we are done + if (split_dim_size == 0) { + return; + } + // If the input tensor is empty, we are done + if (get_num_elems(input_shape, rank) == 0) { + return; + } + // make sure input and outputs are valid + if (!input) { + throw std::runtime_error("input is NULL!"); + } + for (int i = 0; i < num_splits; i++) { + if (!outputs[i]) { + throw std::runtime_error("NULL output found at: " + std::to_string(i)); + } + } + +{{exec_paths}} + + throw std::runtime_error( + "Unsupported cat kernel specialization!" + ); +} +""" +) + + +OUTPUT_SHAPE_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}}{{index_type}} *{{output_shape_name}}[] = { +{{indent}} {{output_dim_refs}} +{{indent}}}; +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ + +{{indent}} {{output_elem_type}} *outputs[] = { +{{indent}} {{outputs}} +{{indent}} }; + +{{output_shape_defs}} + +{{indent}} {{index_type}} **output_shapes[] = { +{{indent}} {{output_shapes}} +{{indent}} }; + +{{indent}} const {{index_type}} {{input_name}}_shape[] = { +{{indent}} {{input_dims}} +{{indent}} }; + +{{indent}} {{index_type}} split_sizes[] = { +{{indent}} {{split_sizes}} +{{indent}} }; + +{{indent}} {{func_name}}( +{{indent}} outputs, +{{indent}} output_shapes, +{{indent}} {{input_ptr}}, +{{indent}} {{input_name}}_shape, +{{indent}} {{num_splits}}/*num_splits*/, +{{indent}} split_sizes, +{{indent}} {{split_dim}}/*split_dim*/, +{{indent}} {{rank}}/*rank*/, +{{indent}} stream +{{indent}} ); +{{indent}}} +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Generate function declaration. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + Returns + ------- + str + Rendered function declaration. + """ + x = func_attrs["inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + func_name=func_attrs["name"], + elem_output_type=output_type, + elem_input_type=input_type, + ) + + +def gen_function(func_attrs, backend_spec): + """Generates function body. 
+ + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + + Returns + ------- + str + Rendered function body. + """ + inputs = func_attrs["inputs"] + x = inputs[0] + y = func_attrs["outputs"][0] + x_shape = x._attrs["shape"] + + input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + + # TODO: consider to add profiling paths for tuning + # elems_per_thread and threads_per_block + exec_paths = EXEC_COND_TEMPLATE.render( + indent=" ", + rank=len(x_shape), + num_splits=len(func_attrs["split_sizes"]), + elem_type=input_type, + elems_per_thread=128, + threads_per_block=128, + index_type=backend_spec.index_type, + ) + header_src = backend_spec.header_src_template.render() + kernel_src = KERNEL_SRC_TEMPLATE.render( + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + header_src=header_src, + ) + return SRC_TEMPLATE.render( + kernel_src=kernel_src, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + exec_paths=exec_paths, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ) + + +def gen_function_call(func_attrs, backend_spec, indent=" "): + """Generates function call. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + indent : str, optional + Indent for template, by default " ". + + Returns + ------- + str + Rendered function call. + """ + x = func_attrs["inputs"][0] + outputs = func_attrs["outputs"] + y = outputs[0] + split_dim = func_attrs["split_dim"] + num_splits = len(func_attrs["split_sizes"]) + + output_names = ",\n ".join( + [ + backend_spec.cast_to_half_ptr_template.render(name=i._attrs["name"]) + for i in outputs + ] + ) + + output_shape_defs = [] + output_shape_names = [] + for i in outputs: + output_shape_name = "{}_shape".format(i._attrs["name"]) + if output_shape_name not in output_shape_names: + dim_refs = ", ".join( + ["&" + dim._attrs["name"] for dim in i._attrs["shape"]] + ) + one_shape_def = OUTPUT_SHAPE_DEF_TEMPLATE.render( + indent=" ", + output_shape_name=output_shape_name, + output_dim_refs=dim_refs, + index_type=backend_spec.index_type, + ) + output_shape_defs.append(one_shape_def) + output_shape_names.append(output_shape_name) + + x_shape = x._attrs["shape"] + x_dims = ", ".join([dim._attrs["name"] for dim in x_shape]) + casted_x_ptr = backend_spec.cast_to_const_half_ptr_template.render( + name=x._attrs["name"] + ) + + split_sizes = ", ".join([str(i) for i in func_attrs["split_sizes"]]) + + return FUNC_CALL_TEMPLATE.render( + indent=indent, + output_elem_type=backend_spec.dtype_to_backend_type(y._attrs["dtype"]), + outputs=output_names, + output_shape_defs="".join(output_shape_defs), + output_shapes=", ".join(output_shape_names), + input_dims=x_dims, + func_name=func_attrs["name"], + input_name=x._attrs["name"], + input_ptr=casted_x_ptr, + split_dim=split_dim, + rank=len(x._attrs["shape"]), + num_splits=num_splits, + split_sizes=split_sizes, + index_type=backend_spec.index_type, + ) diff --git a/python/aitemplate/backend/common/tensor/argmax_common.py b/python/aitemplate/backend/common/tensor/argmax_common.py new file mode 100644 index 000000000..bb422646e --- /dev/null +++ b/python/aitemplate/backend/common/tensor/argmax_common.py @@ -0,0 +1,456 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +argmax kernel codegen. +""" + +import os +from typing import Any, Dict, List, Tuple + +import jinja2 + +from ... import builder +from ...target import Target + +# pylint: disable=C0301 + +FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template( + "reinterpret_cast(&({{name}}->raw()))" +) + +FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +{{kernel}} + +} // namespace + +{{func_signature}} +{ + + argmax_launcher(stream, elem_cnt, instance_size, instance_num, input, workspace, output); +} + """ +) + +KERNEL_TEMPLATE = jinja2.Template( + """ +const int32_t kThreadsNumPerBlock = 256; +const int32_t kMaxBlocksNum = 8192; + +#define GPU_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline size_t GetAlignedSize(size_t size) { + const size_t kAlignSize = 512; + return (size + kAlignSize - 1) / kAlignSize * kAlignSize; +} + +template + +class TmpBufferManager final { + public: + TmpBufferManager(int32_t capacity, void* ptr, int32_t instance_num) + : capacity_{capacity}, key_value_out_elem_cnt_{instance_num} { + const int32_t key_value_out_aligned_bytes = GetAlignedSize( + key_value_out_elem_cnt_ * sizeof({{cub}}::KeyValuePair)); + + key_value_out_ptr_ = reinterpret_cast<{{cub}}::KeyValuePair*>(ptr); + temp_storage_ptr_ = reinterpret_cast( + reinterpret_cast(key_value_out_ptr_) + + key_value_out_aligned_bytes); + + temp_storage_bytes_ = capacity_ - key_value_out_aligned_bytes; + } + ~TmpBufferManager() = default; + + {{cub}}::KeyValuePair* KeyValueOutPtr() const { + return key_value_out_ptr_; + } + void* TempStoragePtr() const { + return temp_storage_ptr_; + } + + int32_t TempStorageBytes() const { + return temp_storage_bytes_; + } + + private: + int32_t capacity_; + + {{cub}}::KeyValuePair* key_value_out_ptr_; + void* temp_storage_ptr_; + + int32_t key_value_out_elem_cnt_; + int32_t temp_storage_bytes_; +}; + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +template + +size_t InferTempStorageForArgMax(int32_t num_row, int32_t num_col) { + using SegmentOffsetIter = {{cub}}::TransformInputIterator< + int32_t, + MultiplyFunctor, + {{cub}}::CountingInputIterator>; + + {{cub}}::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = {{cub}}::DeviceSegmentedReduce:: + ArgMax*, SegmentOffsetIter>( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_in */ nullptr, + /* d_out */ nullptr, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + + 
/* stream */ 0); + return temp_storage_bytes; +} + +template +void ArgMax( + const T* in_ptr, + int32_t num_row, + int32_t num_col, + void* temp_storage_ptr, + int32_t temp_storage_bytes, + {{cub}}::KeyValuePair* out_ptr, + {{prefix}}Stream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForArgMax(num_row, num_col); + + using SegmentOffsetIter = {{cub}}::TransformInputIterator< + int32_t, + MultiplyFunctor, + {{cub}}::CountingInputIterator>; + + {{cub}}::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = {{cub}}::DeviceSegmentedReduce::ArgMax( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_in */ in_ptr, + /* d_out */ out_ptr, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* stream */ stream); +} + +template +__global__ void WriteKeysToOutput( + const int32_t instance_num, + const int32_t instance_size, + const {{cub}}::KeyValuePair* key_value_out_ptr, + int64_t* out_ptr) { + GPU_KERNEL_LOOP(i, instance_num) { + out_ptr[i] = key_value_out_ptr[i].key{% if is_hipcub %} - instance_size * i{% endif %}; + } +} + +// ALIGNPTR +int64_t* alignPtr(int64_t* ptr, uintptr_t to) { + uintptr_t addr = (uintptr_t)ptr; + if (addr % to) { + addr += to - addr % to; + } + return (int64_t*)addr; +} + +inline int32_t BlocksNum4ThreadsNum(const int32_t n) { + return std::min( + (n + kThreadsNumPerBlock - 1) / kThreadsNumPerBlock, + kMaxBlocksNum); +} + +template +void argmax_launcher( + {{prefix}}Stream_t stream, + const {{index_type}} elem_cnt, + const {{index_type}} instance_size, + const {{index_type}} instance_num, + const void* input, + void* workspace, + void* output) { + const uintptr_t ALIGNMENT = 32; + int64_t* vworkspace = alignPtr((int64_t*)workspace, ALIGNMENT); + T* tmp_buffer = (T*)vworkspace; + + TmpBufferManager buffer_manager( + static_cast(elem_cnt), tmp_buffer, instance_num); + + ArgMax( + (const T*)input, + instance_num, + instance_size, + buffer_manager.TempStoragePtr(), + buffer_manager.TempStorageBytes(), + buffer_manager.KeyValueOutPtr(), + stream); + + WriteKeysToOutput + <<>>( + instance_num, instance_size, buffer_manager.KeyValueOutPtr(), (int64_t*)output); +} +""" +) + + +PROFILER_TEMPLATE = jinja2.Template( + """ +#include +{{header_files}} +size_t GLOBAL_WORKSPACE_SIZE = 0; + +namespace { +{{kernel}} +} // namespace + +int main(int argc, char** argv) { + int instance_size = std::stoi(argv[1]); + int instance_num = std::stoi(argv[2]); + + float runtime_ms = 0; + int32_t key_value_out_bytes = GetAlignedSize(instance_num * sizeof({{cub}}::KeyValuePair)); + size_t temp_storage_bytes = InferTempStorageForArgMax(instance_num, instance_size); + GLOBAL_WORKSPACE_SIZE = GetAlignedSize(key_value_out_bytes + temp_storage_bytes); + + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(int64_t* output, + const half* input, + const {{index_type}} elem_cnt, + const {{index_type}} instance_size, + const {{index_type}} instance_num, + uint8_t* workspace, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{output}}, {{input}}, +{{indent}} {{elem_cnt}}, 
+{{indent}} {{instance_size}}, +{{indent}} {{instance_num}}, +{{indent}} global_workspace, stream /* default stream */ +{{indent}}); + """ +) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + """Generates function. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + header_files : str + Includes the header files for a backend. + backend_spec : class + Specifies the backend configurations. + + Returns + ------- + str + Rendered function. + """ + index_type = backend_spec.index_type + prefix = backend_spec.prefix + cub = backend_spec.cub + return FUNC_TEMPLATE.render( + header_files=header_files, + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], index_type=index_type, prefix=prefix + ), + kernel=KERNEL_TEMPLATE.render( + cub=cub, index_type=index_type, prefix=prefix, is_hipcub=(cub == "hipcub") + ), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + """Generates function decl. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec : class + Specifies the backend configurations. + + Returns + ------- + str + Rendered function decl. + """ + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ), + ).strip() + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent=" ") -> str: + """Generates function call. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec : class + Specifies the backend configurations. + indent : str, optional + Indent for template, by default " ". + + Returns + ------- + str + Rendered function call. + """ + output_name = "" + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 1 + + output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + input_name = backend_spec.cast_to_half_ptr_template.render( + name=func_attrs["inputs"][0]._attrs["name"] + ) + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + + elem_cnt = 1 + for shape in xshape: + elem_cnt *= shape._attrs["values"][0] + instance_size = xshape[-1]._attrs["values"][0] + instance_num = elem_cnt // instance_size + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + input=input_name, + elem_cnt=elem_cnt, + instance_size=instance_size, + instance_num=instance_num, + indent=indent, + ) + + +def add_profiler( + file_pairs: List[Tuple[str, str]], + workdir: str, + op_type: str, + output_name: str, + code: str, +): + prefix = os.path.join(workdir, "profiler", op_type) + if not os.path.exists(prefix): + os.makedirs(prefix) + src_path = os.path.join(prefix, output_name + ".cu") + obj_path = os.path.join(prefix, output_name) + if os.path.exists(obj_path): + return + with open(src_path, "w") as f: + f.write(code) + file_pairs.append((src_path, obj_path)) + + +def gen_profiler( + func_attrs: Dict[str, Any], workdir: str, header_files: str, backend_spec +): + """Generates code for argmax profiling. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + workdir: str + Target directory for generated C++ source code files + header_files : str + Includes the header files for a backend. + backend_spec : class + Specifies the backend configurations. 
+ + Returns + ------- + None + """ + op_type = func_attrs["op"] + file_pairs = [] + index_type = backend_spec.index_type + prefix = backend_spec.prefix + cub = backend_spec.cub + code = PROFILER_TEMPLATE.render( + header_files=header_files, + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], index_type=index_type, prefix=prefix + ), + kernel=KERNEL_TEMPLATE.render( + cub=cub, index_type=index_type, prefix=prefix, is_hipcub=(cub == "hipcub") + ), + cub=cub, + ) + op_name = func_attrs["op"] + add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + target = Target.current() + compile_engine = builder.Builder() + compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True)) diff --git a/python/aitemplate/backend/common/tensor/batch_gather_common.py b/python/aitemplate/backend/common/tensor/batch_gather_common.py new file mode 100644 index 000000000..86bbea7a0 --- /dev/null +++ b/python/aitemplate/backend/common/tensor/batch_gather_common.py @@ -0,0 +1,221 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +batch_gather kernel codegen. +""" + +from typing import Any, Dict + +import jinja2 + +# pylint: disable=C0301 + +FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template( + """reinterpret_cast( + {% if is_cuda %}&({% endif %}{{name}}{% if is_cuda %}->raw()){% endif %})""" +) + +FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +{{kernel}} + +} // namespace + +{{func_signature}} +{ + batch_gather_launcher(stream, batch_num, indices_num, instance_size, gather_dim_size, input, indices, workspace, output); +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(half* output, + const half* input, + const int64_t* indices, + const {{index_type}} batch_num, + const {{index_type}} indices_num, + const {{index_type}} instance_size, + const {{index_type}} gather_dim_size, + uint8_t* workspace, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{output}}, {{input}}, {{indices}}, +{{indent}} {{batch_num}}, +{{indent}} {{indices_num}}, +{{indent}} {{instance_size}}, +{{indent}} {{gather_dim_size}}, +{{indent}} global_workspace, stream /* default stream */ +{{indent}}); + """ +) + +KERNEL_TEMPLATE = jinja2.Template( + """ +const int64_t kThreadsNumPerBlock = 256; +const int64_t kMaxBlocksNum = 8192; + +#define GPU_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ int64_t GetInOffset( + const int64_t out_offset, + const K* indices, + const int64_t indices_num, + const int64_t instance_size, + const int64_t gather_dim_size) { + const int64_t batch_idx = out_offset / (indices_num * instance_size); + const int64_t indices_idx = + out_offset % (indices_num * 
instance_size) / instance_size; + const int64_t inner_idx = out_offset % instance_size; + const int64_t idx = indices[batch_idx * indices_num + indices_idx]; + assert(idx >= 0 && idx < gather_dim_size); + return batch_idx * gather_dim_size * instance_size + idx * instance_size + + inner_idx; +} + +template +__global__ void BatchGatherGpu( + const int64_t elem_cnt, + const T* in, + const K* indices, + const int64_t indices_num, + const int64_t instance_size, + const int64_t gather_dim_size, + T* out) { + GPU_KERNEL_LOOP(i, elem_cnt) { + out[i] = in[GetInOffset( + i, indices, indices_num, instance_size, gather_dim_size)]; + } +} + +inline int64_t BlocksNum4ThreadsNum(const int64_t n) { + return std::min( + (n + kThreadsNumPerBlock - 1) / kThreadsNumPerBlock, + kMaxBlocksNum); +} +template +void batch_gather_launcher( + {{prefix}}Stream_t stream, + const {{index_type}} batch_num, + const {{index_type}} indices_num, + const {{index_type}} instance_size, + const {{index_type}} gather_dim_size, + const T* input, + const K* indices, + void* workspace, + T* output) { + const int64_t elem_cnt = batch_num * indices_num * instance_size; + BatchGatherGpu + <<>>( + elem_cnt, + input, + indices, + indices_num, + instance_size, + gather_dim_size, + output); +} + """ +) + + +def gen_function_call(func_attrs: Dict[str, Any], indent=" ", is_cuda=False) -> str: + output_name = "" + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 2 + + output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"], is_cuda=is_cuda + ) + input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][0]._attrs["name"], is_cuda=is_cuda + ) + indices_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][1]._attrs["name"] + ) + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + indices = func_attrs["inputs"][1] + ind_shape = indices._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + + axis = len(ind_shape) - 1 + batch_num = 1 + for i in range(axis): + batch_num *= yshape[i]._attrs["values"][0] + + indices_num = yshape[axis]._attrs["values"][0] + + instance_size = 1 + for i in range(axis + 1, len(yshape)): + instance_size *= yshape[i]._attrs["values"][0] + + gather_dim_size = xshape[axis]._attrs["values"][0] + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + input=input_name, + indices=indices_name, + batch_num=batch_num, + indices_num=indices_num, + instance_size=instance_size, + gather_dim_size=gather_dim_size, + indent=indent, + ) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + index_type = backend_spec.index_type + prefix = backend_spec.prefix + return FUNC_TEMPLATE.render( + header_files=header_files, + kernel=KERNEL_TEMPLATE.render(index_type=index_type, prefix=prefix), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], index_type=index_type, prefix=prefix + ), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ).strip() + ) diff --git a/python/aitemplate/backend/common/tensor/permute021_common.py b/python/aitemplate/backend/common/tensor/permute021_common.py new file mode 100644 index 000000000..db5ed63fd --- /dev/null +++ b/python/aitemplate/backend/common/tensor/permute021_common.py @@ 
-0,0 +1,304 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common implementations for all backends for permute021. + +For three dimension input, shift the second and the third dimension. +i.e. Output[d0, d2, d1] = Input[d0, d1, d2] + +""" +from typing import Any, Dict + +import jinja2 + +# pylint: disable=C0301,W0613,W0612 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{lib_dtype}}*, + {{lib_dtype}}*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + {{prefix}}Stream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} ({{lib_dtype}}*)({{in_ptr}}), +{{indent}} ({{lib_dtype}}*)({{out_ptr}}), +{{indent}} {{x_dim0}}, +{{indent}} {{x_dim1}}, +{{indent}} {{x_dim2}}, +{{indent}} {{y_dim0}}, +{{indent}} {{y_dim1}}, +{{indent}} {{y_dim2}}, +{{indent}} stream +{{indent}}); +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}permute021_launcher( +{{indent}} in_ptr, +{{indent}} out_ptr, +{{indent}} *x_dim0, +{{indent}} *x_dim1, +{{indent}} *x_dim2, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { +template +__global__ void nhwc_to_nchw_kernel(T *output, + const T *input, + const int n, + const int h, + const int w, + const int c) { + + const int hw = h*w; + const int hwc = hw*c; + __shared__ T shbuf[32 * (32 + 1)]; + const int32_t tid = threadIdx.y*blockDim.x + threadIdx.x; + const int32_t wid = tid / 32; + const int32_t lid = tid % 32; + const int32_t ni = blockIdx.z; + const int32_t hwi0 = blockIdx.y * 32; + const int32_t ci0 = blockIdx.x * 32; + + const size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0; + const T *A = input + input_idx; + if (ci0 + lid < c) { + const int lid_x_33 = lid * 33; + if ((hwi0 + 32) <= hw) { + int hwi = wid; // between 0 and 7 + #pragma unroll + for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) { + shbuf[lid_x_33 + hwi] = A[lid]; + A = &A[8 * c]; + hwi += 8; + } + } else { + for (int hwi = wid; hwi < 32; hwi += 8) { + if ((hwi + hwi0) < hw) { + shbuf[lid_x_33 + hwi] = A[lid]; + } + A = &A[8 * c]; + } + } + } + __syncthreads(); + + const int32_t hwiOut = hwi0 + lid; + output = &output[ni * hwc + hwiOut]; + if (hwiOut < hw) { + if (ci0 + 32 < c) { + int cI = wid; + #pragma unroll + for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) { + output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid]; + cI += 8; + } + } else { + for (int cI = wid; cI < 32; cI += 8) { + if (ci0 + cI < c) { + output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid]; + } + } + } + } +} + +void permute021_launcher({{lib_dtype}}* in_ptr, + {{lib_dtype}}* out_ptr, + int x_dim0, + int x_dim1, + int x_dim2, + {{prefix}}Stream_t stream) { + const int n = x_dim0; + const int h = 1; + const int w = x_dim1; + const int c = x_dim2; + dim3 grid((c + 31)/32, (h*w + 31)/32, n); + dim3 block(32, 8); + nhwc_to_nchw_kernel<{{lib_dtype}}><<>>( + ({{lib_dtype}}*)out_ptr, + (const 
{{lib_dtype}}*)in_ptr, + n, + h, + w, + c + ); +} +} // namespace + +void {{function_name}} ( + {{lib_dtype}}* in_ptr, + {{lib_dtype}}* out_ptr, + int64_t* x_dim0, + int64_t* x_dim1, + int64_t* x_dim2, + int64_t* y_dim0, + int64_t* y_dim1, + int64_t* y_dim2, + {{prefix}}Stream_t stream +) { + if (!in_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + if (!out_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + {{shape_function}} + {{exec_paths}} +} + +""" +) + + +def gen_function( + func_attrs: Dict[str, Any], + template_path: str, + shape_eval_template, + shape_save_template, + header_files: str, + backend_spec, +) -> str: + """ + Parameters + ---------- + func_attrs : Dict[str, Any] + Attributes from Operator + template_path : str + path to library used + shape_eval_template : jinja template + shape_save_template : jinja template + header_files : str + header files included in the function + backend_spec : class + specifies backend configs + + Returns + ------- + str + Source code for function generated. + """ + + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + shape_eval_func = shape_eval_template.render( + indent=" ", + dtype="int64_t ", + x_dim0="*x_dim0", + x_dim1="*x_dim1", + x_dim2="*x_dim2", + ) + shape_save_func = shape_save_template.render( + indent=" ", + y_dim0="*y_dim0", + y_dim1="*y_dim1", + y_dim2="*y_dim2", + ) + shape_func = shape_eval_func + shape_save_func + exec_paths = EXEC_TEMPLATE.render() + return SRC_TEMPLATE.render( + function_name=func_name, + header_files=header_files, + shape_function=shape_func, + exec_paths=exec_paths, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + prefix=backend_spec.prefix, + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies backend configs + + Returns + ------- + str + Function declaration + """ + + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + prefix=backend_spec.prefix, + ) + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent=" ") -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies backend configs + indent : str, optional + Indentation for function call template, by default " " + + Returns + ------- + str + Driver code for invoking call + """ + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + xdtype = x._attrs["dtype"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + out_ptr=y._attrs["name"], + x_dim0="&" + xshape[0]._attrs["name"], + x_dim1="&" + xshape[1]._attrs["name"], + x_dim2="&" + xshape[2]._attrs["name"], + y_dim0="&" + yshape[0]._attrs["name"], + y_dim1="&" + yshape[1]._attrs["name"], + y_dim2="&" + yshape[2]._attrs["name"], + indent=indent, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + ) diff --git a/python/aitemplate/backend/common/tensor/permute102_common.py b/python/aitemplate/backend/common/tensor/permute102_common.py new file mode 100644 index 000000000..807e65bef --- /dev/null +++ b/python/aitemplate/backend/common/tensor/permute102_common.py @@ -0,0 +1,310 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
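An illustrative NumPy sketch of the permute semantics these templates generate kernels for (NumPy is assumed here; this reference is not part of the generated sources). permute021 above swaps the last two dimensions of a 3-D tensor, and the permute102 variant that follows swaps the first two:

import numpy as np

def permute021_reference(x: np.ndarray) -> np.ndarray:
    # Output[d0, d2, d1] = Input[d0, d1, d2]
    assert x.ndim == 3
    return np.ascontiguousarray(x.transpose(0, 2, 1))

def permute102_reference(x: np.ndarray) -> np.ndarray:
    # Output[d1, d0, d2] = Input[d0, d1, d2]
    assert x.ndim == 3
    return np.ascontiguousarray(x.transpose(1, 0, 2))

x = np.arange(2 * 3 * 4, dtype=np.float16).reshape(2, 3, 4)
assert permute021_reference(x)[1, 3, 0] == x[1, 0, 3]
assert permute102_reference(x)[2, 1, 0] == x[1, 2, 0]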
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common implementations for all backends for permute012. + +For three dimension input, shift the first and the second dimension. +i.e. Output[d1, d0, d2] = Input[d0, d1, d2] + +This is a naive modification over cutlass nhwc to nchw op: +https://github.com/NVIDIA/cutlass/blob/master/tools/util/include/cutlass/util/device_nhwc_to_nchw.h +At implementation, it creates d1/32 x d2/32 x d0 blocks, each with 32 x 8 threads, +and each thread processes 4 elements. + +We change the write stage of this cutlass permute op for d1 & d0. +It might not be the most effecient version as applying different dimension on threads +may relate to cache's performance. +""" +from typing import Any, Dict + +import jinja2 + +# pylint: disable=C0301,W0613,W0612 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{lib_dtype}}*, + {{lib_dtype}}*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + {{prefix}}Stream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} ({{lib_dtype}}*){{in_ptr}}, +{{indent}} ({{lib_dtype}}*){{out_ptr}}, +{{indent}} {{x_dim0}}, +{{indent}} {{x_dim1}}, +{{indent}} {{x_dim2}}, +{{indent}} {{y_dim0}}, +{{indent}} {{y_dim1}}, +{{indent}} {{y_dim2}}, +{{indent}} stream +{{indent}}); +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}permute102_launcher( +{{indent}} in_ptr, +{{indent}} out_ptr, +{{indent}} *x_dim0, +{{indent}} *x_dim1, +{{indent}} *x_dim2, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +#define TILE_SIZE 32 +#define CH_K 4 + +namespace { +template +__global__ void nhwc_to_nchw_kernel(T *output, + const T *input, + const int n, + const int h, + const int w, + const int c) { + + const int hw = h*w; + const int hwc = hw*c; + __shared__ T shbuf[TILE_SIZE * (TILE_SIZE + 1)]; + const int32_t tid = threadIdx.y*blockDim.x + threadIdx.x; + const int32_t wid = tid / TILE_SIZE;//th.y:0-7 + const int32_t lid = tid % TILE_SIZE;//th.x:0-31 + const int32_t ni0 = blockIdx.z; + const int32_t hwi0 = blockIdx.y * TILE_SIZE;//parallel 8*seq 4 + const int32_t ci0 = blockIdx.x * TILE_SIZE;//parallel 32 + const size_t input_idx = ni0 * hwc + (hwi0 + wid) * c + ci0; + const T *A = input + input_idx; + if (ci0 + lid < c) { + const int lid_x_33 = lid * (TILE_SIZE + 1); + if ((hwi0 + TILE_SIZE - TILE_SIZE / CH_K) <= hw) { + int hwi = wid; // between 0 and 7 + #pragma unroll + for (int cLoopIdx = 0; cLoopIdx < CH_K; cLoopIdx++) { + shbuf[lid_x_33 + hwi] = A[lid]; + A = &A[TILE_SIZE / CH_K * c];//because c is distributed on threads y + hwi += TILE_SIZE / CH_K; + } + } else { + for (int hwi = wid; hwi < TILE_SIZE; hwi += TILE_SIZE / CH_K) { + if ((hwi + hwi0) < hw) { + shbuf[lid_x_33 + hwi] = A[lid]; + } + A = &A[TILE_SIZE / CH_K * c]; + } + } + } + __syncthreads(); + + const int32_t hwiOut = hwi0 + lid; + const int nc = n*c; + output = &output[hwiOut*nc]; + if(hwiOut < hw){ + if(ci0 + 
TILE_SIZE < c){ + int cI = wid; + #pragma unroll + for(int hwLoopIdx = 0; hwLoopIdx < CH_K; ++hwLoopIdx){ + output[ni0*c + ci0 + cI] = shbuf[(cI)* (TILE_SIZE + 1) + lid]; + cI += TILE_SIZE / CH_K; + } + } else { + for(int cI = wid; cI < TILE_SIZE; cI += TILE_SIZE / CH_K){ + if(ci0+cI<<>>( + out_ptr, + (const {{lib_dtype}}*)in_ptr, + n, + h, + w, + c + ); +} +} // namespace + +void {{function_name}} ( + {{lib_dtype}}* in_ptr, + {{lib_dtype}}* out_ptr, + int64_t* x_dim0, + int64_t* x_dim1, + int64_t* x_dim2, + int64_t* y_dim0, + int64_t* y_dim1, + int64_t* y_dim2, + {{prefix}}Stream_t stream +) { + if (!in_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + if (!out_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + {{shape_function}} + {{exec_paths}} +} + +""" +) + + +def gen_function( + func_attrs: Dict[str, Any], + template_path: str, + shape_eval_template, + shape_save_template, + header_files: str, + backend_spec, +) -> str: + """ + Parameters + ---------- + func_attrs : Dict[str, Any] + Attributes from Operator + template_path : str + path to library used + shape_eval_template : jinja template + shape_save_template : jinja template + backend_spec : class + specifies backend configs + + Returns + ------- + str + Source code for function generated. + """ + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + shape_eval_func = shape_eval_template.render( + indent=" ", + dtype="int64_t ", + x_dim0="*x_dim0", + x_dim1="*x_dim1", + x_dim2="*x_dim2", + ) + shape_save_func = shape_save_template.render( + indent=" ", + y_dim0="*y_dim0", + y_dim1="*y_dim1", + y_dim2="*y_dim2", + ) + shape_func = shape_eval_func + shape_save_func + exec_paths = EXEC_TEMPLATE.render() + return SRC_TEMPLATE.render( + function_name=func_name, + shape_function=shape_func, + exec_paths=exec_paths, + header_files=header_files, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + prefix=backend_spec.prefix, + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies backend configs + + Returns + ------- + str + Function declaration + """ + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + prefix=backend_spec.prefix, + ) + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent=" ") -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies backend configs + indent : str, optional + Indentation for function call template, by default " " + + Returns + ------- + str + Driver code for invoking call + """ + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + xdtype = x._attrs["dtype"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + out_ptr=y._attrs["name"], + x_dim0="&" + xshape[0]._attrs["name"], + x_dim1="&" + xshape[1]._attrs["name"], + x_dim2="&" + xshape[2]._attrs["name"], + y_dim0="&" + yshape[0]._attrs["name"], + y_dim1="&" + yshape[1]._attrs["name"], + y_dim2="&" + yshape[2]._attrs["name"], + indent=indent, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + ) diff --git a/python/aitemplate/backend/common/tensor/permute210_common.py 
b/python/aitemplate/backend/common/tensor/permute210_common.py new file mode 100644 index 000000000..fa1d5d25a --- /dev/null +++ b/python/aitemplate/backend/common/tensor/permute210_common.py @@ -0,0 +1,289 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common implementations for all backends for permute210. + +For three dimension input, shift the first and the third dimension. +i.e. Output[d2, d1, d0] = Input[d0, d1, d2] + +We invoke kernel with the following settings: +thread blocks of (TILE_SIZE x TILE_SIZE/4), +grid size of (ceil(d1/TILE_SIZE) x d2 x ceil(d3/TILE_SIZE)) +For each, we have shared memory of size (TILE_SIZE, TILE_SIZE+1) + +The 4 for thread blocks indicates each thread is responsible of 4 elements. +We use TILE_SIZE = 32 for the time being. +""" +from typing import Any, Dict + +import jinja2 + +# pylint: disable=C0301,W0613,W0612 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{lib_dtype}}*, + {{lib_dtype}}*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + {{prefix}}Stream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} static_cast<{{lib_dtype}}*>({{in_ptr}}), +{{indent}} static_cast<{{lib_dtype}}*>({{out_ptr}}), +{{indent}} {{x_dim0}}, +{{indent}} {{x_dim1}}, +{{indent}} {{x_dim2}}, +{{indent}} {{y_dim0}}, +{{indent}} {{y_dim1}}, +{{indent}} {{y_dim2}}, +{{indent}} stream +{{indent}}); +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}permute210_launcher( +{{indent}} in_ptr, +{{indent}} out_ptr, +{{indent}} *x_dim0, +{{indent}} *x_dim1, +{{indent}} *x_dim2, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +#define TILE_SIZE 32 + +namespace { +template +__global__ void permute210_kernel(T *output, + const T *input, + const int n, + const int c, + const int w) { + __shared__ T shbuf[TILE_SIZE][TILE_SIZE + 1]; + + int32_t strides[2] = { c * w, w }; + int32_t offset = blockIdx.y * strides[1]; // We are slicing through static c. + + int32_t xBlock = blockIdx.x * TILE_SIZE; + int32_t yBlock = blockIdx.z * TILE_SIZE; + int32_t x = xBlock + threadIdx.x; + int32_t y = yBlock + threadIdx.y; + + const int32_t inputIdx = y * strides[0] + offset + xBlock; + const T *A = input + inputIdx; + + if (x < w) { + if (y + 24 < n) { // This guards (y, y+8, y+16, y+24) are within boundary. 
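+                      // (blockDim.y is TILE_SIZE / 4 == 8, so the unrolled loop below reads
+                      // rows y, y + 8, y + 16 and y + 24 of the 32-row tile; checking
+                      // y + 24 < n therefore covers all four reads.)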
+ int tid = threadIdx.y; + #pragma unroll + for (int loopIdx = 0; loopIdx < 4; loopIdx++) { + shbuf[threadIdx.x][tid] = A[threadIdx.x]; + A = &A[8 * strides[0]]; + tid += 8; + } + } else { + #pragma unroll + for (int tid = threadIdx.y; tid < 32; tid += 8) { + if (yBlock + tid < n) { + shbuf[threadIdx.x][tid] = A[threadIdx.x]; + } + A = &A[8 * strides[0]]; + } + } + } + __syncthreads(); + + // Now, we do the computation of transposes toward the new indices + strides[0] = c * n; + strides[1] = n; + offset = blockIdx.y * strides[1]; + + xBlock = blockIdx.z * TILE_SIZE; + yBlock = blockIdx.x * TILE_SIZE; + x = xBlock + threadIdx.x; + y = yBlock + threadIdx.y; + + output = &output[y * strides[0] + offset + xBlock]; + if (x < n) { + if (y + 24 < w) { + int tid = threadIdx.y; + #pragma unroll + for (int loopIdx = 0; loopIdx < 4; loopIdx++) { + output[threadIdx.x] = shbuf[tid][threadIdx.x]; + output = &output[8 * strides[0]]; + tid += 8; + } + } else { + #pragma unroll + for (int tid = threadIdx.y; tid < 32; tid += 8) { + if (yBlock + tid < w) { + output[threadIdx.x] = shbuf[tid][threadIdx.x]; + } + output = &output[8 * strides[0]]; + } + } + } +} + +void permute210_launcher({{lib_dtype}}* in_ptr, + {{lib_dtype}}* out_ptr, + int x_dim0, + int x_dim1, + int x_dim2, + {{prefix}}Stream_t stream) { + dim3 grid((x_dim2 + (TILE_SIZE-1))/TILE_SIZE, x_dim1, (x_dim0 + (TILE_SIZE-1))/TILE_SIZE); + dim3 block(TILE_SIZE, TILE_SIZE/4); + permute210_kernel<{{lib_dtype}}><<>>( + out_ptr, + (const {{lib_dtype}}*)in_ptr, + x_dim0, + x_dim1, + x_dim2 + ); +} +} // namespace + +void {{function_name}} ( + {{lib_dtype}}* in_ptr, + {{lib_dtype}}* out_ptr, + int64_t* x_dim0, + int64_t* x_dim1, + int64_t* x_dim2, + int64_t* y_dim0, + int64_t* y_dim1, + int64_t* y_dim2, + {{prefix}}Stream_t stream +) { + if (!in_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + if (!out_ptr) { + throw std::runtime_error("in_ptr is NULL!"); + } + {{exec_paths}} +} + +""" +) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + header_files : str + header files included in the function + backend_spec : class + specifies the backend configs + + Returns + ------- + str + Source code for function generated. 
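For reference, a short Python sketch (assuming NumPy; not part of the generated sources) of the permute210 semantics and of the launch geometry used by permute210_launcher above: grid = (ceil(d2/TILE_SIZE), d1, ceil(d0/TILE_SIZE)) with 32 x 8 thread blocks, each thread moving four elements.

import math
import numpy as np

TILE_SIZE = 32

def permute210_reference(x: np.ndarray) -> np.ndarray:
    # Output[d2, d1, d0] = Input[d0, d1, d2]
    assert x.ndim == 3
    return np.ascontiguousarray(x.transpose(2, 1, 0))

def permute210_launch_geometry(d0: int, d1: int, d2: int):
    grid = (math.ceil(d2 / TILE_SIZE), d1, math.ceil(d0 / TILE_SIZE))
    block = (TILE_SIZE, TILE_SIZE // 4)  # 32 x 8 threads, 4 elements per thread
    return grid, block

x = np.random.rand(5, 3, 70).astype(np.float16)
assert permute210_reference(x)[4, 2, 1] == x[1, 2, 4]
assert permute210_launch_geometry(*x.shape) == ((3, 3, 1), (32, 8))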
+ """ + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + exec_paths = EXEC_TEMPLATE.render() + return SRC_TEMPLATE.render( + function_name=func_name, + header_files=header_files, + exec_paths=exec_paths, + prefix=backend_spec.prefix, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies the backend configs + + Returns + ------- + str + Function declaration + """ + func_name = func_attrs["name"] + x = func_attrs["inputs"][0] + xdtype = x._attrs["dtype"] + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, + prefix=backend_spec.prefix, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + ) + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent=" ") -> str: + """ + Parameters + ---------- + func_attrs : dict + Attributes from Operator + backend_spec : class + specifies the backend configs + indent : str, optional + Indentation for function call template, by default " " + + Returns + ------- + str + Driver code for invoking call + """ + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + xdtype = x._attrs["dtype"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + out_ptr=y._attrs["name"], + x_dim0="&" + xshape[0]._attrs["name"], + x_dim1="&" + xshape[1]._attrs["name"], + x_dim2="&" + xshape[2]._attrs["name"], + y_dim0="&" + yshape[0]._attrs["name"], + y_dim1="&" + yshape[1]._attrs["name"], + y_dim2="&" + yshape[2]._attrs["name"], + indent=indent, + lib_dtype=backend_spec.dtype_to_lib_type(xdtype), + ) diff --git a/python/aitemplate/backend/common/tensor/slice_common.py b/python/aitemplate/backend/common/tensor/slice_common.py new file mode 100644 index 000000000..fb17116de --- /dev/null +++ b/python/aitemplate/backend/common/tensor/slice_common.py @@ -0,0 +1,902 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Slice backend common implementation. 
+""" +import jinja2 + +CAST_TO_CONST_HALF_PTR_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + + +CAST_TO_HALF_PTR_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + + +SHAPE_UPDATE_FUNC = jinja2.Template( + """ +{{indent}}int64_t output_scatter_dim_value = 0; +{{indent}}for ({{index_type}} i = 0; i < num_inputs; i++) { +{{indent}} output_scatter_dim_value += +{{indent}} slice_end_indices[i][scatter_dim] - slice_start_indices[i][scatter_dim]; +{{indent}}} +{{indent}} +{{indent}}for ({{index_type}} i = 0; i < rank; i++) { +{{indent}} if (i == scatter_dim) { +{% if update_output_shape %} +{{indent}} *output_shape[i] = output_scatter_dim_value; +{% else %} +{{indent}} // skip updating output_shape[i] +{% endif %} +{{indent}} } else { +{{indent}} int64_t dim = slice_end_indices[0][i] - slice_start_indices[0][i]; +{{indent}} for ({{index_type}} j = 1; j < num_inputs; j++) { +{{indent}} if (slice_end_indices[j][i] - slice_start_indices[j][i] != dim) { +{{indent}} throw std::runtime_error("invalid indices"); +{{indent}} } +{% if update_output_shape %} +{{indent}} *output_shape[i] = dim; +{% else %} +{{indent}} // skip updating output_shape[i] +{% endif %} +{{indent}} } +{{indent}} } +{{indent}}} +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_output_type}} * /*output*/, + int64_t *[] /*output_shape*/, + const {{elem_input_type}} *[] /*inputs*/, + const int64_t *[] /*input_shapes*/, + const int64_t *[] /*orig_slice_start_indices*/, + const int64_t *[] /*orig_slice_end_indices*/, + {{index_type}} /*scatter_dim*/, + {{index_type}} /*rank*/, + {{index_type}} /*num_inputs*/, + {{prefix}}Stream_t + ); +""" +) + + +KERNEL_SRC_TEMPLATE = jinja2.Template( + """ +{{header_src}} + +#include +#include +#include +#include +#include +#include + +{% if element_func_def %} +//#include +{% endif %} + +namespace { +#ifndef CHECK_ERROR_SLICE +#define CHECK_ERROR_SLICE(expr) \\ + do { \\ + {{prefix}}Error_t status = (expr); \\ + if (status != {{prefix}}Success) { \\ + auto msg = std::string("Got error: ") + \\ + {{prefix}}GetErrorString(status) + \\ + " at " + __FILE__ + ": " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } while (0) +#endif // CHECK_ERROR_SLICE + +#ifndef LAUNCH_CHECK_SLICE +#define LAUNCH_CHECK_SLICE() CHECK_ERROR_SLICE({{prefix}}GetLastError()) +#endif // LAUNCH_CHECK_SLICE + +{% if element_func_def %} +{{element_func_def}} +{% endif %} + +template +struct SliceMetaData { + const T *inputs[NumInputs]; + int64_t slice_start_indices[NumInputs][Rank]; + int64_t slice_end_indices[NumInputs][Rank]; + {{index_type}} dim; // scatter dimension + int64_t input_strides[NumInputs][Rank]; + int64_t num_elems[NumInputs]; + int64_t offsets[NumInputs]; // value of (dim_offset * output_dim_stride) at + // the dim axis in the output, where dim_offset + // is the offset of the scattered input at the + // dimension axis in the output + int64_t dim_sizes[NumInputs]; // dimension size of the input to be scattered + // at the dim axis +}; + +template <{{index_type}} Rank, {{index_type}} NumInputs> +struct ScatterMetaData { + int64_t output_shape[Rank]; + int64_t output_strides[Rank]; +}; + +__host__ __device__ __forceinline__ +int64_t get_num_elems(const int64_t *shape, {{index_type}} rank) { + {{index_type}} num = 1; + for ({{index_type}} i = 0; i < rank; i++) { + num *= shape[i]; + } + return num; +} + +template <{{index_type}} Rank> +__host__ __device__ int64_t compute_input_linear_index( + 
const int64_t *input_strides, + const int64_t *slice_start_indices, + const int64_t *slice_end_indices, + int64_t linear_idx) { + int64_t input_offset = slice_start_indices[0] * input_strides[0]; + for ({{index_type}} i = Rank - 1; i > 0; i--) { + {{index_type}} curr_output_dim_size = slice_end_indices[i] - slice_start_indices[i]; + int64_t curr_output_idx = linear_idx % curr_output_dim_size; + int64_t curr_input_idx = curr_output_idx + slice_start_indices[i]; + input_offset += curr_input_idx * input_strides[i]; + linear_idx /= curr_output_dim_size; + } + return input_offset + linear_idx * input_strides[0]; +} + +template <{{index_type}} Rank> +__host__ __device__ int64_t compute_output_elem_offset( + const int64_t *output_shape, + const int64_t *output_strides, + int64_t scatter_dim_size, + const {{index_type}} scatter_dim, + int64_t linear_idx) { + int64_t offset = 0; + for ({{index_type}} i = Rank - 1; i >= 1; --i) { + int64_t cur_dim_size = i == scatter_dim ? scatter_dim_size : output_shape[i]; + int64_t next_dim_idx = linear_idx / cur_dim_size; + int64_t cur_dim_idx = linear_idx - cur_dim_size * next_dim_idx; + int64_t cur_dim_offset = cur_dim_idx * output_strides[i]; + offset += cur_dim_offset; + linear_idx = next_dim_idx; + } + return offset + linear_idx * output_strides[0]; +} + +template +__global__ void +slice_scatter_kernel( + ELEM_T *orig_output, + SliceMetaData slice_meta_data, + ScatterMetaData scatter_meta_data) { + const {{index_type}} tid = blockIdx.x * blockDim.x + threadIdx.x; + const {{index_type}} block_y = blockIdx.y % NumInputs; + + READ_T* output = reinterpret_cast(orig_output); + const READ_T* input = + reinterpret_cast(slice_meta_data.inputs[block_y]); + int64_t num_elems = slice_meta_data.num_elems[block_y]; + const int64_t *input_strides = slice_meta_data.input_strides[block_y]; + const int64_t *slice_start_indices = + slice_meta_data.slice_start_indices[block_y]; + const int64_t *slice_end_indices = + slice_meta_data.slice_end_indices[block_y]; + + {{index_type}} scatter_dim = slice_meta_data.dim; + int64_t scatter_dim_size = slice_meta_data.dim_sizes[block_y]; + int64_t scatter_offset = slice_meta_data.offsets[block_y]; + + unsigned read_t_sz = sizeof(READ_T); + unsigned elem_t_sz = sizeof(ELEM_T); + assert(read_t_sz >= elem_t_sz && (read_t_sz % elem_t_sz == 0)); + {{index_type}} n_of_elem_t = read_t_sz / elem_t_sz; + // number of READ_T elements per thread + {{index_type}} reads_per_thread_in_read_t = ElemsPerThread / n_of_elem_t; + const int64_t num_elems_in_read_t = num_elems / n_of_elem_t; + {{index_type}} read_idx = tid; + +#pragma unroll + for ({{index_type}} i = 0; i < reads_per_thread_in_read_t; + i++, read_idx += blockDim.x * gridDim.x) { + if (read_idx >= num_elems_in_read_t) { + break; + } + /* make sure to adjust read_idx, which refers to location at + (read_idx * n_of_elem_t) actually */ + int64_t input_idx = compute_input_linear_index( + input_strides, + slice_start_indices, + slice_end_indices, + read_idx * n_of_elem_t); + int64_t output_elem_offset = compute_output_elem_offset( + scatter_meta_data.output_shape, + scatter_meta_data.output_strides, + scatter_dim_size, + scatter_dim, + read_idx * n_of_elem_t); + + READ_T tmp_v = input[input_idx / n_of_elem_t]; + int64_t output_idx = (scatter_offset + output_elem_offset) / n_of_elem_t; + {% if element_func %} + output[output_idx] = {{element_func}}(tmp_v); + {% else %} + output[output_idx] = tmp_v; + {% endif %} + } +} + +enum class LoadVecType { + VT_HALF = 0, + VT_FLOAT, + VT_FLOAT2, + VT_FLOAT4 
+}; + +template +static inline LoadVecType get_vec_type(int64_t dim_size) { + {{index_type}} size_elem_t = sizeof(ELEM_T); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + if (sizeof(vec_type) % size_elem_t == 0) { \\ + {{index_type}} n_of_elem_t = sizeof(vec_type) / size_elem_t; \\ + if (dim_size % n_of_elem_t == 0) { \\ + return load_vec_type; \\ + } \\ + } + + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + +#undef HANDLE_ONE_VEC_TYPE + throw std::runtime_error( + "Cannot resolve LoadVecType." + ); +} + +template +static LoadVecType get_input_vec_type( + const int64_t *output_strides, + const ELEM_T *input, + const int64_t *input_shape, + const int64_t *input_strides, + const int64_t *slice_start_indices, + const int64_t *slice_end_indices, + {{index_type}} scatter_dim, + {{index_type}} scatter_offset, + {{index_type}} dim_size) { + // get the outermost index where we continuous element accesses + {{index_type}} flatten_index = Rank - 1; + for (; flatten_index >= 0; flatten_index--) { + if (slice_end_indices[flatten_index] - slice_start_indices[flatten_index] != + input_shape[flatten_index]) { + break; + } + } + int64_t input_start_offset = + compute_input_linear_index(input_strides, + slice_start_indices, + slice_end_indices, + /*linear_idx*/0); + LoadVecType slice_vec_type1 = + get_vec_type(input_start_offset); + LoadVecType slice_vec_type2; + if (Rank == 1) { + int64_t continuous_read_size = slice_end_indices[0] - slice_start_indices[0]; + slice_vec_type2 = get_vec_type(continuous_read_size); + } else { + int64_t continuous_read_size = + (slice_end_indices[flatten_index] - slice_start_indices[flatten_index]) * + input_strides[flatten_index]; + LoadVecType vec_type1 = get_vec_type(continuous_read_size); + continuous_read_size = + (input_shape[flatten_index] - slice_end_indices[flatten_index]) * + input_strides[flatten_index]; + LoadVecType vec_type2 = get_vec_type(continuous_read_size); + // find the smaller alignment reqirement between the sliced piece + // and the rest along the flattened dimensions + slice_vec_type2 = vec_type1 < vec_type2 ? vec_type1 : vec_type2; + } + LoadVecType slice_min_vec_type = slice_vec_type1 < slice_vec_type2 ? + slice_vec_type1 : slice_vec_type2; + + LoadVecType scatter_vec_type1 = get_vec_type(dim_size); + LoadVecType scatter_vec_type2 = get_vec_type(scatter_offset); + LoadVecType scatter_min_vec_type = scatter_vec_type1 < scatter_vec_type2 ? + scatter_vec_type1 : scatter_vec_type2; + + LoadVecType min_vec_type = slice_min_vec_type < scatter_min_vec_type ? 
+ slice_min_vec_type : scatter_min_vec_type; + return min_vec_type; +} + +template +void prepare_one_meta_data( + {{index_type}} input_idx, + SliceMetaData &slice_meta_data, + ScatterMetaData &scatter_meta_data, + const ELEM_T *input, + const int64_t *input_shape, + const int64_t *slice_start_indices, + const int64_t *slice_end_indices, + {{index_type}} scatter_dim, + {{index_type}} scatter_dim_offset) { + slice_meta_data.inputs[input_idx] = input; + slice_meta_data.input_strides[input_idx][Rank-1] = 1; + for ({{index_type}} i = Rank - 2; i >= 0; i--) { + slice_meta_data.input_strides[input_idx][i] = + slice_meta_data.input_strides[input_idx][i+1] * input_shape[i+1]; + } + + slice_meta_data.num_elems[input_idx] = 1; + for ({{index_type}} i = 0; i < Rank; i++) { + assert(slice_start_indices[i] >= 0 && + slice_start_indices[i] <= input_shape[i]); + assert(slice_end_indices[i] >= 0 && slice_end_indices[i] <= input_shape[i]); + assert(slice_start_indices[i] <= slice_end_indices[i]); + + slice_meta_data.num_elems[input_idx] *= + slice_end_indices[i] - slice_start_indices[i]; + slice_meta_data.slice_start_indices[input_idx][i] = slice_start_indices[i]; + slice_meta_data.slice_end_indices[input_idx][i] = slice_end_indices[i]; + } + + slice_meta_data.dim_sizes[input_idx] = + slice_end_indices[scatter_dim] - slice_start_indices[scatter_dim]; + slice_meta_data.offsets[input_idx] = + scatter_dim_offset * scatter_meta_data.output_strides[scatter_dim]; +} + +template +void slice_scatter_kernel_launcher( + ELEM_T *output, + const int64_t *output_shape, + const ELEM_T *inputs[], + const int64_t *input_shapes[], + const std::vector> &slice_start_indices, + const std::vector> &slice_end_indices, + {{index_type}} scatter_dim, + {{prefix}}Stream_t stream +) { + SliceMetaData slice_meta_data; + ScatterMetaData scatter_meta_data; + + // meta data for placing sliced output + scatter_meta_data.output_strides[Rank-1] = 1; + scatter_meta_data.output_shape[Rank-1] = output_shape[Rank-1]; + for ({{index_type}} i = Rank - 2; i >= 0; i--) { + scatter_meta_data.output_strides[i] = + scatter_meta_data.output_strides[i+1] * output_shape[i+1]; + scatter_meta_data.output_shape[i] = output_shape[i]; + } + + {{index_type}} scatter_dim_offset = 0; + slice_meta_data.dim = scatter_dim; + for ({{index_type}} i = 0; i < NumInputs; i++) { + prepare_one_meta_data(i, slice_meta_data, scatter_meta_data, + inputs[i], input_shapes[i], + slice_start_indices[i].data(), + slice_end_indices[i].data(), + scatter_dim, scatter_dim_offset); + scatter_dim_offset += slice_meta_data.dim_sizes[i]; + } + + LoadVecType min_vec_type = LoadVecType::VT_FLOAT4; + for ({{index_type}} i = 0; i < NumInputs; i++) { + LoadVecType vec_type = get_input_vec_type( + scatter_meta_data.output_strides, + inputs[i], + input_shapes[i], + slice_meta_data.input_strides[i], + slice_start_indices[i].data(), + slice_end_indices[i].data(), + scatter_dim, + slice_meta_data.offsets[i], + slice_meta_data.dim_sizes[i]); + min_vec_type = vec_type < min_vec_type ? 
vec_type : min_vec_type; + } + + // setup kernel configs + int64_t max_num_elems = 0; + for ({{index_type}} i = 0; i < NumInputs; i++) { + if (slice_meta_data.num_elems[i] > max_num_elems) { + max_num_elems = slice_meta_data.num_elems[i]; + } + } + + {{index_type}} m = max_num_elems % (ThreadsPerBlock * ElemsPerThread) != 0; + {{index_type}} num_blocks_x = + (max_num_elems / (ThreadsPerBlock * ElemsPerThread)) + m; + dim3 grid_config = dim3(num_blocks_x, NumInputs); + +#define HANDLE_ONE_VEC_TYPE(load_vec_type, vec_type) \\ + case load_vec_type: { \\ + if (ElemsPerThread * sizeof(ELEM_T) < sizeof(vec_type)) { \\ + throw std::runtime_error( \\ + std::string("No valid kernel available for ") + #vec_type); \\ + } \\ + slice_scatter_kernel \\ + <<>>( \\ + output, \\ + slice_meta_data, \\ + scatter_meta_data); \\ + LAUNCH_CHECK_SLICE(); \\ + break; \\ + } + + switch (min_vec_type) { + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT4, float4) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT2, float2) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_FLOAT, float) + HANDLE_ONE_VEC_TYPE(LoadVecType::VT_HALF, half) + default: + throw std::runtime_error("Invalid LoadVecType\\n"); + } + +#undef HANDLE_ONE_VEC_TYPE +} + +std::tuple, std::vector> +normalize_slice_indices( + const int64_t *input_shape, + const int64_t *orig_slice_start_indices, + const int64_t *orig_slice_end_indices, + {{index_type}} rank) { + std::vector slice_start_indices(rank); + std::vector slice_end_indices(rank); + for ({{index_type}} i = 0; i < rank; i++) { + slice_start_indices[i] = orig_slice_start_indices[i] < 0 ? + input_shape[i] + orig_slice_start_indices[i]: + orig_slice_start_indices[i]; + // make it compatible with PyTorch + slice_start_indices[i] = slice_start_indices[i] < 0 ? + 0 : slice_start_indices[i]; + if (slice_start_indices[i] < 0) { + slice_start_indices[i] = 0; + } + if (slice_start_indices[i] > input_shape[i]) { + slice_start_indices[i] = input_shape[i]; + } + + slice_end_indices[i] = orig_slice_end_indices[i] < 0 ? + input_shape[i] + orig_slice_end_indices[i]: + orig_slice_end_indices[i]; + // make it compatible with PyTorch + slice_end_indices[i] = slice_end_indices[i] < 0 ? 
+ 0 : slice_end_indices[i]; + if (slice_end_indices[i] < 0) { + slice_end_indices[i] = 0; + } + if (slice_end_indices[i] > input_shape[i]) { + slice_end_indices[i] = input_shape[i]; + } + + // make it compatible with PyTorch + if (slice_start_indices[i] > slice_end_indices[i]) { + slice_start_indices[i] = slice_end_indices[i]; + } + } + + return {slice_start_indices, slice_end_indices}; +} +} // namespace + +""" +) + + +EXEC_COND_TEMPLATE = jinja2.Template( + """ +{{indent}}if (rank == {{rank}} && num_inputs == {{num_inputs}}) { +{{indent}} int64_t local_output_shape[{{rank}}]; +{% for rank_idx in range(rank) %} +{{indent}} local_output_shape[{{rank_idx}}] = *output_shape[{{rank_idx}}]; +{% endfor %} +{{indent}} slice_scatter_kernel_launcher<{{elem_type}}, +{{indent}} {{rank}}/*Rank*/, +{{indent}} {{num_inputs}}/*NumInputs*/, +{{indent}} {{elems_per_thread}}/*ElemsPerThread*/, +{{indent}} {{threads_per_block}}/*ThreadsPerBlock*/>( +{{indent}} output, local_output_shape, inputs, input_shapes, +{{indent}} slice_start_indices, slice_end_indices, scatter_dim, stream); +{{indent}} return; +{{indent}}} +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +{{kernel_src}} + +void {{func_name}}( + {{elem_output_type}} *output, + int64_t *output_shape[], + const {{elem_input_type}} *inputs[], + const int64_t *input_shapes[], + const int64_t *orig_slice_start_indices[], + const int64_t *orig_slice_end_indices[], + {{index_type}} scatter_dim, + {{index_type}} rank, + {{index_type}} num_inputs, + {{prefix}}Stream_t stream + ) { + + if (rank <= 0) { + throw std::runtime_error("rank must > 0!"); + } + if (scatter_dim >= rank) { + throw std::runtime_error("scatter_dim must < rank!"); + } + + // clip slip start and end indices + std::vector> slice_start_indices(num_inputs); + std::vector> slice_end_indices(num_inputs); + std::vector output_dim_sizes; + for ({{index_type}} i = 0; i < num_inputs; i++) { + std::vector start_indices; + std::vector end_indices; + std::tie(start_indices, end_indices) = + normalize_slice_indices(input_shapes[i], + orig_slice_start_indices[i], + orig_slice_end_indices[i], + rank); + slice_start_indices[i] = start_indices; + slice_end_indices[i] = end_indices; + } + +{{shape_function}} + + // If all input tensors are empty, we are done + bool empty = true; + for ({{index_type}} i = 0; i < num_inputs; i++) { + if (get_num_elems(input_shapes[i], rank) != 0) { + empty = false; + // make sure input is valid for each non-zero-size tensor + if (!inputs[i]) { + throw std::runtime_error("NULL input is found at: " + std::to_string(i)); + } + } + } + + if (empty) + return; + + // if we output has any zero dim size, we are done + for ({{index_type}} i = 0; i < rank; i++) { + if (*output_shape[i] == 0) + return; + } + // make sure we have a valid output pointer + if (!output) { + throw std::runtime_error("output is NULL!"); + } + +{{exec_paths}} + + throw std::runtime_error( + "Unsupported cat kernel specialization!" 
+ ); +} +""" +) + + +DEFAULT_OUTPUT_SHAPE_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}} int64_t *{{output_name}}_shape[] = { +{{indent}} {{output_dim_refs}} +{{indent}} }; +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{output_shape_def}} + +{{indent}} const half *inputs[] = { +{{indent}} {{inputs}} +{{indent}} }; + +{{input_shape_defs}} + +{{indent}} const int64_t *input_shapes[] = { +{{indent}} {{input_shapes}} +{{indent}} }; + +{{start_indices_defs}} + +{{indent}} const int64_t *slice_start_indices[] = { +{{indent}} {{slice_start_indices}} +{{indent}} }; + +{{end_indices_defs}} + +{{indent}} const int64_t *slice_end_indices[] = { +{{indent}} {{slice_end_indices}} +{{indent}} }; + +{{indent}} {{func_name}}( +{{indent}} {{output_ptr}}, +{{indent}} {{output_name}}_shape, +{{indent}} inputs, +{{indent}} input_shapes, +{{indent}} slice_start_indices, +{{indent}} slice_end_indices, +{{indent}} {{scatter_dim}}/*scatter_dim*/, +{{indent}} {{rank}}/*rank*/, +{{indent}} {{num_inputs}}/*num_inputs*/, +{{indent}} stream +{{indent}} ); +{{indent}}} +""" +) + + +INPUT_SHAPE_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}}int64_t {{input_shape_name}}[] = { +{{indent}} {{input_dims}} +{{indent}}}; +""" +) + + +INPUT_INDICES_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}}int64_t {{input_indices_name}}[] = { +{{indent}} {{input_indices}} +{{indent}}}; +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Generate function declaration. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec: dataclass + Backend specification. + + Returns + ------- + str + Rendered function declaration. + """ + x = func_attrs["inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + func_name=func_attrs["name"], + elem_output_type=output_type, + elem_input_type=input_type, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ) + + +def gen_function( + func_attrs, + backend_spec, + elems_per_thread=8, + update_output_shape=True, + element_func=None, + element_func_def=None, + extra_header_template=None, +): + """Generates function body. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec: dataclass + Backend specification. + elems_per_thread: int + Per thread elements. + update_output_shape: bool + Whether to update output shape, by default True. + element_func: str + Attributes for ease of tanh concatenate fusion, default is None. + element_func_def: str + Implmentation for fast_tanh, default is None. + extra_header_template: str + Header for fast_tanh, default is None. + + + Returns + ------- + str + Rendered function body. 
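The slice-scatter behaviour generated above can be summarized with a small NumPy reference (NumPy assumed; illustration only, not part of the generated sources): every input is sliced with its normalized start/end indices, the slices must agree on all dimensions except scatter_dim, and they are concatenated along scatter_dim.

import numpy as np

def slice_scatter_reference(inputs, start_indices, end_indices, scatter_dim):
    slices = []
    for x, starts, ends in zip(inputs, start_indices, end_indices):
        slices.append(x[tuple(slice(s, e) for s, e in zip(starts, ends))])
    return np.concatenate(slices, axis=scatter_dim)

a = np.arange(24, dtype=np.float16).reshape(2, 3, 4)
b = np.arange(24, 48, dtype=np.float16).reshape(2, 3, 4)
y = slice_scatter_reference(
    [a, b],
    start_indices=[[0, 0, 1], [0, 0, 1]],
    end_indices=[[2, 2, 3], [2, 3, 3]],
    scatter_dim=1,
)
assert y.shape == (2, 5, 2)  # (2, (2 - 0) + (3 - 0), 3 - 1)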
+ """ + inputs = func_attrs["inputs"] + x = inputs[0] + y = func_attrs["outputs"][0] + x_shape = x._attrs["shape"] + + input_type = backend_spec.dtype_to_backend_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_backend_type(y._attrs["dtype"]) + + # TODO: consider to add profiling paths for tuning + # elems_per_thread and threads_per_block + exec_paths = EXEC_COND_TEMPLATE.render( + indent=" ", + num_inputs=len(inputs), + rank=len(x_shape), + elem_type=input_type, + elems_per_thread=elems_per_thread, + threads_per_block=128, + ) + + shape_func = SHAPE_UPDATE_FUNC.render( + indent=" ", + update_output_shape=update_output_shape, + index_type=backend_spec.index_type, + ) + extra_header = ( + extra_header_template.render(element_func_def=element_func_def) + if extra_header_template is not None + else "" + ) + header_src = backend_spec.header_src_template.render(extra_header=extra_header) + kernel_src = KERNEL_SRC_TEMPLATE.render( + element_func=element_func, + element_func_def=element_func_def, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + header_src=header_src, + ) + return SRC_TEMPLATE.render( + kernel_src=kernel_src, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + shape_function=shape_func, + exec_paths=exec_paths, + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + header_src=header_src, + ) + + +def gen_function_call( + backend_spec, + func_name, + inputs, + outputs, + start_indices, + end_indices, + dim=0, + indent=" ", + output_shape_def=None, +): + """Generates function call. + + Parameters + ---------- + backend_spec: dataclass + Backend specification. + func_name : str + Function neame + inputs : List[Tensor] + Input tensors. + outputs : List[Tensor] + Output tensors. + start_indices : List[List[int]] + each input has its own list of indices + end_indices : List[List[int]] + Each input has its own list of indices + dim : int + Specify the concat dim if we concat outputs of all inputs, by default 0. + indent : str, optional + Indent for template, by default " ". + output_shape_def: jinja2.Template + output shape template, by default None. + + Returns + ------- + str + Rendered function call. 
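The start/end indices passed here are normalized at runtime by normalize_slice_indices in the kernel source above. A plain-Python sketch of those PyTorch-compatible rules (illustration only, not part of the generated sources): negative indices wrap around, out-of-range indices are clamped to [0, dim], and start is clamped to end so an inverted range yields an empty slice.

def normalize_slice_indices_reference(shape, starts, ends):
    norm_starts, norm_ends = [], []
    for dim, s, e in zip(shape, starts, ends):
        s = s + dim if s < 0 else s
        e = e + dim if e < 0 else e
        s = min(max(s, 0), dim)
        e = min(max(e, 0), dim)
        norm_starts.append(min(s, e))  # clamp start to end
        norm_ends.append(e)
    return norm_starts, norm_ends

# dim 0: start -2 wraps to 2, end 10 clamps to 4; dim 1: end -1 wraps to 4
assert normalize_slice_indices_reference([4, 5], [-2, 0], [10, -1]) == ([2, 0], [4, 4])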
+ """ + assert len(inputs) == len(start_indices) == len(end_indices) + x = inputs[0] + y = outputs[0] + + input_names = ",\n ".join( + [ + backend_spec.cast_to_const_half_ptr_template.render(name=i._attrs["name"]) + for i in inputs + ] + ) + + input_shape_defs = [] + input_shape_names = [] + start_indices_defs = [] + start_indices_names = [] + end_indices_defs = [] + end_indices_names = [] + + for idx, (i, s_indices, e_indices) in enumerate( + zip(inputs, start_indices, end_indices) + ): + input_shape_name = "{}_shape".format(i._attrs["name"]) + s_indices_name = "{}_slice_start_indices_{}".format(i._attrs["name"], idx) + e_indices_name = "{}_slice_end_indices_{}".format(i._attrs["name"], idx) + if input_shape_name not in input_shape_names: + dims = ", ".join([dim._attrs["name"] for dim in i._attrs["shape"]]) + one_shape_def = INPUT_SHAPE_DEF_TEMPLATE.render( + indent=" ", input_shape_name=input_shape_name, input_dims=dims + ) + input_shape_defs.append(one_shape_def) + + s_indices_str = ", ".join([str(i) for i in s_indices]) + one_s_indices_def = INPUT_INDICES_DEF_TEMPLATE.render( + indent=" ", + input_indices_name=s_indices_name, + input_indices=s_indices_str, + ) + start_indices_defs.append(one_s_indices_def) + + e_indices_str = ", ".join([str(i) for i in e_indices]) + one_e_indices_def = INPUT_INDICES_DEF_TEMPLATE.render( + indent=" ", + input_indices_name=e_indices_name, + input_indices=e_indices_str, + ) + end_indices_defs.append(one_e_indices_def) + + input_shape_names.append(input_shape_name) + start_indices_names.append(s_indices_name) + end_indices_names.append(e_indices_name) + + if output_shape_def is None: + y_dim_refs = ", ".join(["&" + dim._attrs["name"] for dim in y._attrs["shape"]]) + output_shape_def = DEFAULT_OUTPUT_SHAPE_DEF_TEMPLATE.render( + indent=indent, output_name=y._attrs["name"], output_dim_refs=y_dim_refs + ) + + casted_y_ptr = backend_spec.cast_to_half_ptr_template.render(name=y._attrs["name"]) + + return FUNC_CALL_TEMPLATE.render( + indent=indent, + func_name=func_name, + output_elem_type=backend_spec.dtype_to_backend_type(y._attrs["dtype"]), + output_name=y._attrs["name"], + output_ptr=casted_y_ptr, + output_shape_def=output_shape_def, + inputs=input_names, + input_shape_defs="".join(input_shape_defs), + input_shapes=", ".join(input_shape_names), + start_indices_defs="".join(start_indices_defs), + slice_start_indices=", ".join(start_indices_names), + end_indices_defs="".join(end_indices_defs), + slice_end_indices=", ".join(end_indices_names), + scatter_dim=dim, + rank=len(x._attrs["shape"]), + num_inputs=len(inputs), + ) diff --git a/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py new file mode 100644 index 000000000..b8901a062 --- /dev/null +++ b/python/aitemplate/backend/common/tensor/slice_reshape_scatter_common.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +Slice reshape backend common implementation. +""" +import functools + +import jinja2 + +from . import slice_common + +OUTPUT_DIM_DEF_TEMPLATE = jinja2.Template( + """ +{{indent}}int64_t {{dim_name}} = {{dim_value}}; +""" +) + +OUTPUT_SHAPE_DEF_TEMPLATE = jinja2.Template( + """ +{{dim_defs}} +{{indent}} int64_t *{{output_name}}_shape[] = { +{{indent}} {{output_dim_refs}} +{{indent}} }; +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Generate function declaration. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec: dataclass + Backend specification. + + Returns + ------- + str + Rendered function declaration. + """ + return slice_common.gen_function_decl(func_attrs, backend_spec=backend_spec) + + +def gen_function( + func_attrs, backend_spec, tanh_def, element_func=None, extra_header_template=None +): + """Generates function body. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec: dataclass + Backend specification. + element_func: str + Attributes for ease of tanh concatenate fusion, default is None. + extra_header_template: str + Header for fast_tanh, default is None. + + + Returns + ------- + str + Rendered function body. + """ + # TODO: consider to profile elems_per_thread + elems_per_thread = 8 if len(func_attrs["inputs"]) == 1 else 256 + element_func_def = None if element_func is None else tanh_def.render() + return slice_common.gen_function( + func_attrs, + backend_spec=backend_spec, + elems_per_thread=elems_per_thread, + update_output_shape=False, + element_func=element_func, + element_func_def=element_func_def, + extra_header_template=extra_header_template, + ) + + +def gen_function_call(func_attrs, backend_spec, indent=" "): + """Generates function call. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec: dataclass + Backend specification. + indent : str, optional + Indent for template, by default " ". + + Returns + ------- + str + Rendered function call. 
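As a small illustration of what the function body below does before delegating to slice_common.gen_function_call (not part of the generated sources; the shape values are hypothetical): all output dimensions from scatter_dim onward are folded into a single dimension, so the slice-scatter kernel sees a lower-rank output shape.

import functools

dims = [2, 3, 4, 5]  # hypothetical static output shape
scatter_dim = 1
new_dims = dims[:scatter_dim]
new_dims.append(functools.reduce(lambda a, b: a * b, dims[scatter_dim:]))
assert new_dims == [2, 60]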
+ """ + slice_ops = func_attrs["slice_ops"] + assert len(slice_ops) >= 1 + start_indices = [op._attrs["start_indices"] for op in slice_ops] + end_indices = [op._attrs["end_indices"] for op in slice_ops] + + y = func_attrs["outputs"][0] + dims = [d._attrs["values"][0] for d in y._attrs["shape"]] + scatter_dim = func_attrs["scatter_dim"] + output_shape_dims = [] + output_shape_dim_defs = [] + new_dims = dims[:scatter_dim] + remaining_dim = functools.reduce(lambda a, b: a * b, dims[scatter_dim:]) + new_dims.append(remaining_dim) + for i, dim in enumerate(new_dims): + dim_name = "output_dim_{}".format(i) + output_shape_dims.append(dim_name) + dim_def = OUTPUT_DIM_DEF_TEMPLATE.render( + indent=indent, dim_name=dim_name, dim_value=dim + ) + output_shape_dim_defs.append(dim_def) + y_dim_refs = ", ".join(["&" + dim for dim in output_shape_dims]) + output_shape_def = OUTPUT_SHAPE_DEF_TEMPLATE.render( + indent=indent, + dim_defs="".join(output_shape_dim_defs), + output_name=y._attrs["name"], + output_dim_refs=y_dim_refs, + ) + + return slice_common.gen_function_call( + backend_spec, + func_attrs["name"], + func_attrs["inputs"], + func_attrs["outputs"], + start_indices, + end_indices, + dim=scatter_dim, + indent=indent, + output_shape_def=output_shape_def, + ) diff --git a/python/aitemplate/backend/common/tensor/topk_common.py b/python/aitemplate/backend/common/tensor/topk_common.py new file mode 100644 index 000000000..6b82ef531 --- /dev/null +++ b/python/aitemplate/backend/common/tensor/topk_common.py @@ -0,0 +1,769 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +topk kernel codegen. +""" + +import os +from typing import Any, Dict, List, Tuple + +import jinja2 + +from ... 
import builder +from ...target import Target + +# pylint: disable=C0301 + +FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +{{kernel}} + +} // namespace + +{{func_signature}} +{ + topk_launcher(stream, elem_cnt, instance_size, instance_num, top_k, input, workspace, output); +} + """ +) + +PROFILER_TEMPLATE = jinja2.Template( + """ +#include +{{header_files}} + +size_t GLOBAL_WORKSPACE_SIZE = 0; + +namespace { + +{{kernel}} + +} // namespace + +int main(int argc, char** argv) { + int elem_cnt = std::stoi(argv[1]); + int instance_size = std::stoi(argv[2]); + int instance_num = std::stoi(argv[3]); + + float runtime_ms = 0; + const int64_t sorted_in_aligned_bytes = GetAlignedSize(elem_cnt * sizeof(half)); + const int64_t indices_aligned_bytes = GetAlignedSize(elem_cnt * sizeof(int64_t)); + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; + int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending(instance_size, instance_num); + GLOBAL_WORKSPACE_SIZE = GetAlignedSize(sorted_in_aligned_bytes + indices_aligned_bytes + sorted_indices_aligned_bytes + temp_storage_bytes); + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(int64_t* output, + const half* input, + const {{index_type}} elem_cnt, + const {{index_type}} instance_size, + const {{index_type}} instance_num, + const {{index_type}} top_k, + uint8_t* workspace, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{output}}, {{input}}, +{{indent}} {{elem_cnt}}, +{{indent}} {{instance_size}}, +{{indent}} {{instance_num}}, +{{indent}} {{top_k}}, +{{indent}} global_workspace, stream /* default stream */ +{{indent}}); + """ +) + +KERNEL_TEMPLATE = jinja2.Template( + """ +const int32_t kThreadsNumPerBlock = 256; +const int32_t kMaxBlocksNum = 8192; + +#define GPU_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline size_t GetAlignedSize(size_t size) { + const size_t kAlignSize = 512; + return (size + kAlignSize - 1) / kAlignSize * kAlignSize; +} + +template +T GetZeroVal() { + return static_cast(0); +} + +template +T GetOneVal() { + return static_cast(1); +} + +template +T GetMinVal() { + uint16_t ret = 0xfbff; + return *(T*)&ret; +} + +template +T GetMaxVal() { + uint16_t ret = 0x7bff; + return *(T*)&ret; +} + +template +T PowOf2Floor(T val, int64_t max_power) { + T max_floor = static_cast(std::pow(2, max_power)); + val = std::min(val, max_floor); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { + return ret == val ? 
ret : ret / 2; + } + } +} + +template +T PowOf2Ceil(T val, int64_t max_power) { + T max_ceil = static_cast(std::pow(2, max_power)); + val = std::min(val, max_ceil); + T ret = GetOneVal(); + while (true) { + ret *= 2; + if (ret >= val) { + return ret; + } + } +} + +template +__device__ void BitonicSwap( + T* data, + const int64_t i, + const int64_t j, + const bool dir, + const Compare& comp) { + if (comp(data[i], data[j]) == dir) { + T tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +template +size_t InferTempStorageForSortPairsDescending( + int32_t num_row, + int32_t num_col) { + using SegmentOffsetIter = {{cub}}::TransformInputIterator< + int32_t, + MultiplyFunctor, + {{cub}}::CountingInputIterator>; + + {{cub}}::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = {{cub}}::DeviceSegmentedRadixSort:: + SortPairsDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + + return temp_storage_bytes; +} + +template +void SortPairsDescending( + const KeyType* keys_ptr, + const ValueType* values_ptr, + int32_t num_row, + int32_t num_col, + void* temp_storage_ptr, + int32_t temp_storage_bytes, + KeyType* sorted_keys_ptr, + ValueType* sorted_values_ptr, + {{prefix}}Stream_t stream) { + size_t rt_inferred_temp_storage_bytes = + InferTempStorageForSortPairsDescending( + num_row, num_col); + + using SegmentOffsetIter = {{cub}}::TransformInputIterator< + int32_t, + MultiplyFunctor, + {{cub}}::CountingInputIterator>; + + {{cub}}::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + auto err = {{cub}}::DeviceSegmentedRadixSort::SortPairsDescending( + /* d_temp_storage */ temp_storage_ptr, + /* temp_storage_bytes */ rt_inferred_temp_storage_bytes, + /* d_keys_in */ keys_ptr, + /* d_keys_out */ sorted_keys_ptr, + /* d_values_in */ values_ptr, + /* d_values_out */ sorted_values_ptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ stream); +} + +template +__device__ void +BitonicSort(T* data, const int64_t elem_cnt, const Compare& comp) { + // The element count of instance should be pow-of-2 + assert(elem_cnt > 0 && !(elem_cnt & (elem_cnt - 1))); + + // Generate a bitonic sequence from input + for (int64_t size = 2; size <= elem_cnt / 2; size *= 2) { + // Merge 2 bitonic sequences of length 'size' into a bitonic sequence of + // length '2 * size' + for (int64_t stride = size / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; + swap_id += blockDim.x) { + // Change dir at intervals of 'size / 2' 
swaps + const bool dir = swap_id & (size / 2); + // Locate the pair {pos, pos + stride} which is going te be swaped if + // needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, dir, comp); + + __syncthreads(); + } + } + } + + // Sort the bitonic sequence + for (int64_t stride = elem_cnt / 2; stride > 0; stride /= 2) { + for (int64_t swap_id = threadIdx.x; swap_id < elem_cnt / 2; + swap_id += blockDim.x) { + // Locate the pair {pos, pos + stride} which is going te be swaped if + // needed + const int pos = 2 * swap_id - (swap_id & (stride - 1)); + + BitonicSwap(data, pos, pos + stride, false, comp); + + __syncthreads(); + } + } +} + +template +class Entry final { + public: + __device__ __forceinline__ Entry(int64_t index, T value) + : index_(index), value_(value) {} + + __device__ __forceinline__ int64_t GetIndex() const { + return index_; + } + __device__ __forceinline__ T GetValue() const { + return value_; + } + __device__ __forceinline__ void SetIndex(int64_t index) { + index_ = index; + } + __device__ __forceinline__ void SetValue(T value) { + value_ = value; + } + + __device__ __forceinline__ bool operator<(const Entry& entry) const { + return (value_ < entry.GetValue()) || + (value_ == entry.GetValue() && index_ > entry.GetIndex()); + } + __device__ __forceinline__ bool operator>(const Entry& entry) const { + return (value_ > entry.GetValue()) || + (value_ == entry.GetValue() && index_ < entry.GetIndex()); + } + + private: + int64_t index_; + T value_; +}; + +template +class MinHeap final { + public: + __device__ __forceinline__ MinHeap( + Entry* data, + const int64_t heap_size, + const int64_t init_index, + const T init_value) + : data_(data), heap_size_(heap_size) { + for (int64_t i = 0; i < heap_size; ++i) { + data_[i].SetIndex(init_index); + data_[i].SetValue(init_value); + } + } + __device__ __forceinline__ Entry& Top() { + return data_[0]; + } + __device__ __forceinline__ void Swap(const int64_t i, const int64_t j) { + auto tmp = data_[j]; + data_[j] = data_[i]; + data_[i] = tmp; + } + __device__ __forceinline__ void MinHeapify(int64_t index) { + while (true) { + const int64_t left = 2 * index + 1; + const int64_t right = 2 * index + 2; + int64_t min = index; + if (left < heap_size_ && data_[left] < data_[min]) { + min = left; + } + if (right < heap_size_ && data_[right] < data_[min]) { + min = right; + } + if (min == index) { + return; + } + Swap(min, index); + index = min; + } + } + + private: + Entry* data_; + int64_t heap_size_; +}; + +template +class TmpBufferManager final { + public: + TmpBufferManager(int64_t capacity, void* ptr, const int64_t N) + : capacity_{capacity}, + sorted_in_elem_cnt_{N}, + indices_elem_cnt_{sorted_in_elem_cnt_}, + sorted_indices_elem_cnt_{sorted_in_elem_cnt_} { + const int64_t sorted_in_aligned_bytes = + GetAlignedSize(sorted_in_elem_cnt_ * sizeof(T)); + const int64_t indices_aligned_bytes = + GetAlignedSize(indices_elem_cnt_ * sizeof(int64_t)); + const int64_t sorted_indices_aligned_bytes = indices_aligned_bytes; + sorted_in_ptr_ = reinterpret_cast(ptr); + indices_ptr_ = reinterpret_cast( + reinterpret_cast(sorted_in_ptr_) + sorted_in_aligned_bytes); + sorted_indices_ptr_ = reinterpret_cast( + reinterpret_cast(indices_ptr_) + indices_aligned_bytes); + temp_storage_ptr_ = reinterpret_cast( + reinterpret_cast(sorted_indices_ptr_) + + sorted_indices_aligned_bytes); + temp_storage_bytes_ = capacity_ - sorted_in_aligned_bytes - + indices_aligned_bytes - sorted_indices_aligned_bytes; + } + 
~TmpBufferManager() = default; + + T* SortedInPtr() const { + return sorted_in_ptr_; + } + int64_t* IndicesPtr() const { + return indices_ptr_; + } + int64_t* SortedIndicesPtr() const { + return sorted_indices_ptr_; + } + void* TempStoragePtr() const { + return temp_storage_ptr_; + } + + int64_t TempStorageBytes() const { + return temp_storage_bytes_; + } + + private: + int64_t capacity_; + + T* sorted_in_ptr_; + int64_t* indices_ptr_; + int64_t* sorted_indices_ptr_; + void* temp_storage_ptr_; + + int64_t sorted_in_elem_cnt_; + int64_t indices_elem_cnt_; + int64_t sorted_indices_elem_cnt_; + int64_t temp_storage_bytes_; +}; + +__global__ void InitializeIndices( + int64_t elem_cnt, + int64_t* indices_ptr, + int64_t instance_size) { + GPU_KERNEL_LOOP(i, elem_cnt) { + indices_ptr[i] = i % instance_size; + }; +} + +template +__global__ void GetOutput( + int64_t top_k, + int64_t instance_num, + int64_t instance_size, + int64_t* indices_ptr, + T* output) { + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < instance_num; + j += blockDim.y * gridDim.y) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < top_k; + i += blockDim.x * gridDim.x) { + output[top_k * j + i] = indices_ptr[instance_size * j + i]; + } + } +} + +template +__global__ void HeapTopKKernel( + const T* in_ptr, + const int64_t instance_num, + const int64_t instance_size, + const int64_t k, + const int64_t heap_size, + const int64_t init_index, + const T init_value, + int64_t* out_ptr) { + extern __shared__ char smem[]; + auto* shared_entries = reinterpret_cast*>(smem); + + // Divide elements to be sorted into disjoint sets (# of sets == # of heaps). + // Each thread in the thread block manipulates one heap to select top + // heap_size entries from corresponding set + const T* input = in_ptr + blockIdx.x * instance_size; + auto heap = MinHeap( + shared_entries + threadIdx.x * heap_size, + heap_size, + init_index, + init_value); + for (int64_t i = threadIdx.x; i < instance_size; i += blockDim.x) { + auto entry = Entry(i, input[i]); + if (entry > heap.Top()) { + heap.Top() = entry; + heap.MinHeapify(0); + } + } + + __syncthreads(); + + // Merge all heaps into a unified, sorted array + BitonicSort( + shared_entries, + blockDim.x * heap_size, + [](const Entry& x, const Entry& y) { return x > y; }); + + // Write top_k elements in sorted array to output + for (int64_t i = threadIdx.x; i < k; i += blockDim.x) { + (out_ptr + blockIdx.x * k)[i] = shared_entries[i].GetIndex(); + } +} +// ALIGNPTR +int64_t* alignPtr(int64_t* ptr, uintptr_t to) { + uintptr_t addr = (uintptr_t)ptr; + if (addr % to) { + addr += to - addr % to; + } + return (int64_t*)addr; +} + +inline int32_t BlocksNum4ThreadsNum(const int32_t n) { + return std::min( + (n + kThreadsNumPerBlock - 1) / kThreadsNumPerBlock, + kMaxBlocksNum); +} + +template +void topk_launcher( + {{prefix}}Stream_t stream, + const int elem_cnt, + const int instance_size, + const int instance_num, + const int top_k, + const void* input, + void* workspace, + void* output) { + const int32_t k = std::min(top_k, instance_size); + + if (top_k < 100) { + const int32_t kMaxSharedMemoryByteSize = 48 << 10; + + // Use as many heaps as possible (# of heaps == # of threads used in thread + // block). 
Limitation 1: size of shared memory We also need heap_size * + // num_heap to be pow-of-2 which is necessary for bitonic sort + const int64_t heap_size = PowOf2Ceil(k, 16); + int32_t num_heap = PowOf2Floor( + kMaxSharedMemoryByteSize / (heap_size * sizeof(Entry)), 16); + // Limitation 2: # of threads in thread block + num_heap = std::min(num_heap, kThreadsNumPerBlock); + + HeapTopKKernel + <<), + stream>>>( + (const T*)input, + instance_num, + instance_size, + k, + heap_size, + GetMaxVal(), + GetMinVal(), + (int64_t*)output); + + } else { + const uintptr_t ALIGNMENT = 32; + int64_t* vworkspace = alignPtr((int64_t*)workspace, ALIGNMENT); + T* tmp_buffer = (T*)vworkspace; + + TmpBufferManager buf_manager( + static_cast(elem_cnt), tmp_buffer, elem_cnt); + + InitializeIndices<<< + BlocksNum4ThreadsNum(elem_cnt), + kThreadsNumPerBlock, + 0, + stream>>>(elem_cnt, buf_manager.IndicesPtr(), instance_size); + + SortPairsDescending( + (const T*)input, + buf_manager.IndicesPtr(), + instance_num, + instance_size, + buf_manager.TempStoragePtr(), + buf_manager.TempStorageBytes(), + buf_manager.SortedInPtr(), + buf_manager.SortedIndicesPtr(), + stream); + + {{prefix}}Memcpy2DAsync( + (int64_t*)output, + k * sizeof(int64_t), + buf_manager.SortedIndicesPtr(), + instance_size * sizeof(int64_t), + k * sizeof(int64_t), + instance_num, + {{prefix}}MemcpyDefault, + stream); + } +} + """ +) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + """Generates function. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + header_files : str + Includes the header files for a backend. + backend_spec : class + Specifies the backend configurations. + + Returns + ------- + str + Rendered function. + """ + index_type = backend_spec.index_type + prefix = backend_spec.prefix + return FUNC_TEMPLATE.render( + header_files=header_files, + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], index_type=index_type, prefix=prefix + ), + kernel=KERNEL_TEMPLATE.render(cub=backend_spec.cub, prefix=prefix), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + """Generates function decl. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec : class + Specifies the backend configurations. + + Returns + ------- + str + Rendered function decl. + """ + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + ), + ).strip() + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent=" ") -> str: + """Generates function call. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + backend_spec : class + Specifies the backend configurations. + indent : str, optional + Indent for template, by default " ". + + Returns + ------- + str + Rendered function call. 
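+
+    Note
+    ----
+    Top-k is applied along the innermost (last) input dimension: instance_size
+    is the extent of that dimension and instance_num is the product of all
+    remaining dimensions.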
+ """ + output_name = "" + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 1 + + output_name = FUNC_CALL_INT64_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + input_name = backend_spec.cast_to_half_ptr_template.render( + name=func_attrs["inputs"][0]._attrs["name"] + ) + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + + elem_cnt = 1 + for shape in xshape: + elem_cnt *= shape._attrs["values"][0] + instance_size = xshape[-1]._attrs["values"][0] + instance_num = elem_cnt // instance_size + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + input=input_name, + elem_cnt=elem_cnt, + instance_size=instance_size, + instance_num=instance_num, + top_k=func_attrs["topK"], + indent=indent, + ) + + +def add_profiler( + file_pairs: List[Tuple[str, str]], + workdir: str, + op_type: str, + output_name: str, + code: str, +): + prefix = os.path.join(workdir, "profiler", op_type) + if not os.path.exists(prefix): + os.makedirs(prefix) + src_path = os.path.join(prefix, output_name + ".cu") + obj_path = os.path.join(prefix, output_name) + if os.path.exists(obj_path): + return + with open(src_path, "w") as f: + f.write(code) + file_pairs.append((src_path, obj_path)) + + +def gen_profiler( + func_attrs: Dict[str, Any], workdir: str, header_files: str, backend_spec +): + """Generates code for topk profiling. + + Parameters + ---------- + func_attrs : Dict[str, Any] + Stores the operation attributes. + workdir: str + Target directory for generated C++ source code files + header_files : str + Includes the header files for a backend. + backend_spec : class + Specifies the backend configurations. + + Returns + ------- + None + """ + # If topK is less than 100, disable profiling since our implementation does not need it. + if func_attrs["topK"] < 100: + func_attrs["has_profiler"] = False + return + + op_type = func_attrs["op"] + file_pairs = [] + index_type = backend_spec.index_type + prefix = backend_spec.prefix + code = PROFILER_TEMPLATE.render( + header_files=header_files, + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], index_type=index_type, prefix=prefix + ), + kernel=KERNEL_TEMPLATE.render(cub=backend_spec.cub, prefix=prefix), + ) + op_name = func_attrs["op"] + add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + target = Target.current() + compile_engine = builder.Builder() + compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True)) diff --git a/python/aitemplate/backend/common/tensor_accessor.cuh b/python/aitemplate/backend/common/tensor_accessor.cuh new file mode 100644 index 000000000..64179da02 --- /dev/null +++ b/python/aitemplate/backend/common/tensor_accessor.cuh @@ -0,0 +1,110 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +#ifndef AIT_TENSOR_ACCESSOR_CUH +#define AIT_TENSOR_ACCESSOR_CUH + +// Returns a strided address based on a base pointer, an index and strided +// information. +// DATA_T: tensor data type. +// READ_T: actual data type used when reading data. e.g. for a "half" +// tensor, READ_T could be uint4 when all data is aligned. +// data: A base pointer in READ_T type. +// idx: read index in terms of READ_T. +// offset, original_total_elements_from_stride_dim and +// actual_total_elements_from_stride_dim are the corresponding data member +// values of TensorAccessor. +template +__device__ __forceinline__ READ_T* get_strided_address( + READ_T* data, + int64_t idx, + int64_t offset, + int64_t original_total_elements_from_stride_dim, + int64_t actual_total_elements_from_stride_dim) { + (void)original_total_elements_from_stride_dim; // Suppress incorrect declared + // but never referenced warning + // from nvcc. + (void)actual_total_elements_from_stride_dim; // Ditto. + if constexpr (is_contiguous) { + return reinterpret_cast(reinterpret_cast(data) + offset) + + idx; + } else { + constexpr int N_ELEMENTS_PER_READ = sizeof(READ_T) / sizeof(DATA_T); + int64_t data_idx = idx * N_ELEMENTS_PER_READ; + int64_t num_rows = data_idx / original_total_elements_from_stride_dim; + int64_t row_offset = data_idx % original_total_elements_from_stride_dim; + data_idx = + num_rows * actual_total_elements_from_stride_dim + row_offset + offset; + return reinterpret_cast( + reinterpret_cast(data) + data_idx); + } + return nullptr; // Suppress incorrect warning about missing return statement + // from nvcc. +} + +static inline uint64_t max_power2_divisor(uint64_t n) { + // max power of 2 which divides n + return n & (~(n - 1)); +} + +// A TensorAccessor which handles strided tensor access underneath. +struct TensorAccessor { + int64_t offset{0}; + bool is_contiguous{true}; + + int stride_dim{-1}; + int64_t original_total_elements_from_stride_dim{-1}; + int64_t actual_total_elements_from_stride_dim{-1}; + + // Returns an address based on a base pointer and an index. + + // DATA_T: tensor data type. + // READ_T: actual data type used when reading data. e.g. for a "half" + // tensor, READ_T could be uint4 when all data is aligned. + // data: A base pointer in READ_T type. + // idx: read index in terms of READ_T. + template + __device__ inline READ_T* get(READ_T* data, int64_t idx) const { + return is_contiguous ? get_strided_address( + data, + idx, + offset, + original_total_elements_from_stride_dim, + actual_total_elements_from_stride_dim) + : get_strided_address( + data, + idx, + offset, + original_total_elements_from_stride_dim, + actual_total_elements_from_stride_dim); + } + + uint64_t max_alignment() const { + // gcd of max alignments + auto alignment = max_power2_divisor(offset); + if (!is_contiguous) { + alignment |= max_power2_divisor(original_total_elements_from_stride_dim); + alignment |= max_power2_divisor(actual_total_elements_from_stride_dim); + } + return max_power2_divisor(alignment); + } + + bool is_valid_alignment(uint64_t n) const { + // n is a power of 2; return whether tensor accessor alignment is divisible + // by n. + return !(max_alignment() & (n - 1)); + } +}; + +#endif diff --git a/python/aitemplate/backend/common/tensor_accessor_codegen.py b/python/aitemplate/backend/common/tensor_accessor_codegen.py new file mode 100644 index 000000000..e2e873647 --- /dev/null +++ b/python/aitemplate/backend/common/tensor_accessor_codegen.py @@ -0,0 +1,163 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Tensor accessor related codegens. +""" + +import os +from typing import List + +import jinja2 + +from ...compiler.tensor_accessor import TensorAccessor +from ..target import Target + +# Template used to transform a Python TensorAccessor object +# to a C++ TensorAccessor struct. +TENSOR_ACCESSOR_TEMPLATE = jinja2.Template( + """ + TensorAccessor {{name}} = { + {{tensor_accessor.offset}}, + {% if tensor_accessor.is_contiguous %} + true + {% else %} + false + {% endif %} + {% if not tensor_accessor.is_contiguous %} + , + {{tensor_accessor.stride_dim}}, + {{tensor_accessor.original_total_elements_from_stride_dim}}, + {{tensor_accessor.actual_total_elements_from_stride_dim}} + {% endif %} + }; +""" +) + +STRIDED_ADDRESS_AT_IDX_FUNC_TEMPLATE = jinja2.Template( + """ +template +__device__ __forceinline__ READ_T* get_strided_address_at_idx( + READ_T *data, int64_t data_idx) { +{%if output_accessor.is_contiguous %} + return get_strided_address( + data, data_idx, {{output_accessor.offset}}, 0, 0); +{% else %} + return get_strided_address( + data, data_idx, + {{output_accessor.offset}}, + {{output_accessor.original_total_elements_from_stride_dim}}, + {{output_accessor.actual_total_elements_from_stride_dim}}); +{% endif %} +} +""" +) + + +def get_libs() -> str: + return Target.current().get_custom_libs( + os.path.dirname(__file__), "tensor_accessor.cuh" + ) + + +# Currently read4, add2 is best for both backend, so two backend seems identical. +# They may diverge when we got deeper understanding / further optimization. +ALIGNMENTS = [ + 8, + 4, + 2, + 1, +] + + +def _find_max_alignment(number: int) -> int: + """ + Return the first alignment value that meets the alignment requirement + for accessing the `number` of elements. 
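+
+    For example, with ALIGNMENTS = [8, 4, 2, 1], number = 12 yields 4 and
+    number = 6 yields 2; any odd number falls through to 1.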
+ """ + for alignment in ALIGNMENTS: + if number % alignment == 0: + return alignment + return 1 + + +def find_max_alignment_for_accessor(accessor: TensorAccessor) -> int: + """the max alignment value that meets the requirement specified by + the accessor + + Parameters + ---------- + accessors: TensorAccessor + + Returns + ---------- + int + the max alignment value + """ + alignment = _find_max_alignment(accessor.offset) + if not accessor.is_contiguous: + alignment = min( + alignment, + _find_max_alignment(accessor.original_total_elements_from_stride_dim), + ) + alignment = min( + alignment, + _find_max_alignment(accessor.actual_total_elements_from_stride_dim), + ) + return alignment + + +def find_max_alignment_for_accessors(accessors: List[TensorAccessor]) -> int: + """the max alignment value that meets the requirement specified by + the accessors + + Parameters + ---------- + accessors: List[TensorAccessor] + TensorAccessor(s) attached to the relevant tensor being accessed + + Returns + ---------- + int + the max alignment value + """ + alignment = max(ALIGNMENTS) + # Handle accessors + for accessor in accessors: + alignment = min(alignment, find_max_alignment_for_accessor(accessor)) + return alignment + + +def find_max_alignment(num_elements: int, accessors: List[TensorAccessor]) -> int: + """find the max alignment value that meets the requirement of accessing + num_elements of data with access patterns (strides and offsets) + specified by accessors + + Parameters + ---------- + num_elements: int + specify the number of elements being accessed + + accessors: List[TensorAccessor] + TensorAccessor(s) attached to the relevant tensor being accessed + + Returns + ---------- + int + the max alignment value + """ + # get initial alignment based on the number of elements being accessed + alignment = _find_max_alignment(num_elements) + accessor_alignment = find_max_alignment_for_accessors(accessors) + return min(alignment, accessor_alignment) diff --git a/python/aitemplate/backend/common/upsampling2d_common.py b/python/aitemplate/backend/common/upsampling2d_common.py new file mode 100644 index 000000000..6d7aadd3c --- /dev/null +++ b/python/aitemplate/backend/common/upsampling2d_common.py @@ -0,0 +1,425 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Backend-agnostic function templates for upsampling2d. 
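+Both bilinear and nearest modes operate on NHWC fp16 tensors and can
+optionally fuse a residual add (bias_add) into the upsampled output.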
+""" + +import jinja2 + +# pylint: disable=C0103,C0415,W0613,C0301,W0612 + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}bilinear_upsampling_luncher( +{{indent}} in_ptr, +{% if bias_add %} + {{indent}} res_ptr, +{% endif %} +{{indent}} out_ptr, +{{indent}} NI, +{{indent}} HI, +{{indent}} WI, +{{indent}} CI, +{{indent}} HO, +{{indent}} WO, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { +#define GPU_1D_KERNEL_LOOP(i, n) \ + for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +{% if mode == "bilinear"%} +__global__ void bilinear_upsampling_f16_nhwc_kernel(const half2* input, + {% if bias_add %} + const half2* input_res, + {% endif %} + half2* output, + const {{index_type}} batch, + const {{index_type}} in_height, + const {{index_type}} in_width, + const {{index_type}} channels, + const {{index_type}} out_height, + const {{index_type}} out_width) { + + const float height_scale = in_height / static_cast(out_height); + const float width_scale = in_width / static_cast(out_width); + const int64_t num_threads = out_height * out_width * channels * batch; + +GPU_1D_KERNEL_LOOP(out_idx, num_threads) { + int64_t idx = out_idx; + const int64_t c = idx % channels; + idx /= channels; + const int64_t x = idx % out_width; + idx /= out_width; + const int64_t y = idx % out_height; + const int64_t b = idx / out_height; + + const float in_y = (static_cast(y) + 0.5f) * height_scale - 0.5f; + const int64_t top_y_index = in_y > 0.0 ? floorf(in_y) : 0; + const int64_t bottom_y_index = + (in_y < in_height - 1) ? ceilf(in_y) : in_height - 1; + const float y_lerp = in_y - floorf(in_y); + + const float in_x = (static_cast(x) + 0.5f) * width_scale - 0.5f; + const int64_t left_x_index = in_x > 0.0 ? floorf(in_x) : 0; + const int64_t right_x_index = + (in_x < in_width - 1) ? 
ceilf(in_x) : in_width - 1; + const float x_lerp = in_x - floorf(in_x); + + const half2 top_left = __ldg( + input + ((b * in_height + top_y_index) * in_width + left_x_index) * + channels + + c); + + const half2 top_right = __ldg( + input + ((b * in_height + top_y_index) * in_width + right_x_index) * + channels + + c); + const half2 bottom_left = __ldg( + input + ((b * in_height + bottom_y_index) * in_width + left_x_index) * + channels + + c); + const half2 bottom_right = __ldg( + input + ((b * in_height + bottom_y_index) * in_width + right_x_index) * + channels + + c); + + float top_x = __half2float(top_left{{half2_data_ref}}.x) + (__half2float(top_right{{half2_data_ref}}.x) - __half2float(top_left{{half2_data_ref}}.x)) * x_lerp; + float top_y = __half2float(top_left{{half2_data_ref}}.y) + (__half2float(top_right{{half2_data_ref}}.y) - __half2float(top_left{{half2_data_ref}}.y)) * x_lerp; + + float bottom_x = __half2float(bottom_left{{half2_data_ref}}.x) + (__half2float(bottom_right{{half2_data_ref}}.x) - __half2float(bottom_left{{half2_data_ref}}.x)) * x_lerp;; + float bottom_y = __half2float(bottom_left{{half2_data_ref}}.y) + (__half2float(bottom_right{{half2_data_ref}}.y) - __half2float(bottom_left{{half2_data_ref}}.y)) * x_lerp;; + + float2 out = {0.f, 0.f}; + out.x = top_x + (bottom_x - top_x) * y_lerp; + out.y = top_y + (bottom_y - top_y) * y_lerp; + + {% if bias_add %} + output[out_idx] = __hadd2(__float22half2_rn(out), __ldg(input_res + out_idx)); + {% else %} + output[out_idx] = __float22half2_rn(out); + {% endif %} + } + +} + +{% else %} +template +__global__ void nearest_upsampling_f16_nhwc_kernel(const T* input, + {% if bias_add %} + const T* input_res, + {% endif %} + T* output, + const {{index_type}} batch, + const {{index_type}} in_height, + const {{index_type}} in_width, + const {{index_type}} channels, + const {{index_type}} out_height, + const {{index_type}} out_width) { + + const float height_scale = in_height / static_cast(out_height); + const float width_scale = in_width / static_cast(out_width); + const int64_t nthreads = out_height * out_width * channels * batch; + +GPU_1D_KERNEL_LOOP(index, nthreads) { + int n = index; + int c = n % channels; + n /= channels; + int out_x = n % out_width; + n /= out_width; + int out_y = n % out_height; + n /= out_height; + + const T* bottom_data_n = input + n * channels * in_height * in_width; + const int in_y = + max(min(static_cast( + floorf((static_cast(out_y) + 0.5f) * height_scale)), + static_cast(in_height) - 1), + 0); + const int in_x = + max(min(static_cast( + floorf((static_cast(out_x) + 0.5f) * width_scale)), + static_cast(in_width) - 1), + 0); + const int idx = (in_y * in_width + in_x) * channels + c; + + + {% if bias_add %} + T input_val = __ldg(bottom_data_n + idx); + T input_res_val = __ldg(input_res + index); + {% if tsize == 1 %} + output[index] = input_val + input_res_val; + + {% elif tsize == 8 %} + T output_val; + Telement* pack_y = reinterpret_cast(&output_val); + Telement* pack_x = reinterpret_cast(&input_val); + Telement* pack_res = reinterpret_cast(&input_res_val); + for (int k = 0 ; k < element_in_Tio ; k++) + pack_y[k] = pack_x[k] + pack_res[k]; + output[index] = output_val; + + {% else %} + T output_val; + output_val{{half2_data_ref}}.x = input_val{{half2_data_ref}}.x + input_res_val{{half2_data_ref}}.x; + output_val{{half2_data_ref}}.y = input_val{{half2_data_ref}}.y + input_res_val{{half2_data_ref}}.y; + output[index] = output_val; + {% endif %} + {% else %} + output[index] = __ldg(bottom_data_n + idx); + 
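+    // No residual input: nearest upsampling is a pure gather from the source
+    // location (in_y, in_x) computed above.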
{% endif %} + + } +} + +{% endif %} + +template +constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +void bilinear_upsampling_luncher({{elem_input_type}}* input, + {% if bias_add %} + {{elem_input_type}}* input_res, + {% endif %} + {{elem_output_type}}* output, + const {{index_type}} N, + const {{index_type}} H, + const {{index_type}} W, + const {{index_type}} C, + const {{index_type}} HO, + const {{index_type}} WO, + {{prefix}}Stream_t stream) { + const int64_t output_size = N * (C) * HO * WO; + dim3 grid(std::min( + ceil_div(static_cast(output_size), static_cast(512)), + static_cast(4096))); + dim3 block(512); + +{% if mode == "bilinear" %} + bilinear_upsampling_f16_nhwc_kernel<<>>( + (const half2 *)input, + {% if bias_add %} + (const half2 *)input_res, + {% endif %} + (half2 *)output, + N, H, W, C/2, HO, WO); +{% else %} + {% if tsize == 1 %} + nearest_upsampling_f16_nhwc_kernel<<>>( + (const half *)input, + {% if bias_add %} + (const half *)input_res, + {% endif %} + (half *)output, + N, H, W, C, HO, WO); + {% elif tsize == 8 %} + nearest_upsampling_f16_nhwc_kernel<<>>( + (const float4 *)input, + {% if bias_add %} + (const float4 *)input_res, + {% endif %} + (float4 *)output, + N, H, W, C/8, HO, WO); + {% else %} + nearest_upsampling_f16_nhwc_kernel<<>>( + (const half2 *)input, + {% if bias_add %} + (const half2 *)input_res, + {% endif %} + (half2 *)output, + N, H, W, C/2, HO, WO); + {% endif %} +{% endif %} +} +} // namespace + +void {{function_name}} ( + {{elem_input_type}}* in_ptr, + {% if bias_add %} + {{elem_input_type}}* res_ptr, + {% endif %} + {{elem_output_type}}* out_ptr, + {{index_type}}* batch, + {{index_type}}* in_h, + {{index_type}}* in_w, + {{index_type}}* in_ch, + {{index_type}}* out_batch, + {{index_type}}* out_h, + {{index_type}}* out_w, + {{prefix}}Stream_t stream +) { + {{shape_function}} + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this bilinear upsampling specialization." 
+ ); +} +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_input_type}}*, + {% if bias_add %} + {{elem_input_type}}*, + {% endif %} + {{elem_output_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{prefix}}Stream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr}}), +{% if bias_add %} + {{indent}} static_cast<{{elem_input_type}}*>({{res_ptr}}), +{% endif %} +{{indent}} static_cast<{{elem_output_type}}*>({{out_ptr}}), +{{indent}} {{p_batch}}, +{{indent}} {{p_in_h}}, +{{indent}} {{p_in_w}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p_out_batch}}, +{{indent}} {{p_out_h}}, +{{indent}} {{p_out_w}}, +{{indent}} stream +{{indent}}); +""" +) + + +def gen_function_decl(func_attrs, backend_spec, bias_add=False): + """Function declaration generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + backend_spec : custom class + It specifies the corresponding backend dtypes of pytorch dtypes for many operations + + Returns + ------- + str + Rendered function declaration stmt + """ + x = func_attrs["inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + bias_add=bias_add, + ) + + +def gen_alignment(x): + in_channel = x.shape()[-1].value() + if in_channel % 8 == 0: + tsize = 8 + elif in_channel % 4 == 0: + tsize = 4 + elif in_channel % 2 == 0: + tsize = 2 + else: + tsize = 1 + return tsize + + +def gen_function_call(func_attrs, backend_spec, indent=" ", bias_add=False): + """Function call generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + indent : str, optional + Indent for template, by default " " + + Returns + ------- + str + Rendered function call + """ + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + if bias_add: + r = func_attrs["inputs"][1] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + index_type=backend_spec.index_type, + in_ptr=x._attrs["name"], + res_ptr=r._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + indent=indent, + bias_add=bias_add, + ) + else: + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + index_type=backend_spec.index_type, + in_ptr=x._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + 
p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + indent=indent, + bias_add=bias_add, + ) diff --git a/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py new file mode 100644 index 000000000..8431e5d87 --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/efficient_nms_common.py @@ -0,0 +1,250 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +nms kernel codegen for CUDA. +""" + +import os +from typing import Any, Dict + +import jinja2 + +from ... import builder +from ...target import Target +from .efficient_nms_kernel import kernel + +# pylint: disable=C0301 + +FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +{{kernel}} + +} // namespace + +{{func_signature}} +{ + + const int N = *batch; + const int R = *num_rois; + const int C = *num_classes; + + EfficientNMSParameters mParam; + mParam.iouThreshold = iouThreshold; + mParam.scoreThreshold = 0.001; + mParam.boxDecoder = false; + mParam.numOutputBoxesPerClass = nmsMaxOut; + mParam.numOutputBoxes = nmsMaxOut; + mParam.batchSize = N; + mParam.numBoxElements = R * C * 4; + mParam.numScoreElements = R * C; + mParam.numAnchors = R; + mParam.numClasses = C; + mParam.shareLocation = (C == 1) ? 
true : false; + mParam.outputONNXIndices = false; + mParam.scoreSigmoid = false; + mParam.numSelectedBoxes = 5000; + + const void* const boxesInput = proposals; + const void* const scoresInput = fgScores; + const void* const anchorsInput = nullptr; + + void* numDetectionsOutput = num_detections; + void* nmsBoxesOutput = detection_boxes; + void* nmsScoresOutput = detection_scores; + void* nmsClassesOutput = detection_classe; + + return EfficientNMSInference(mParam, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, + nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nullptr, workspace, stream); + + +} + """ +) + +PROFILER_TEMPLATE = jinja2.Template( + """ +#include +{{header_files}} +size_t GLOBAL_WORKSPACE_SIZE = 0; + +namespace { + +{{kernel}} + +} // namespace + +int main(int argc, char** argv) { + float runtime_ms = 0; + int batchSize = std::stoi(argv[1]); + int numScoreElements = std::stoi(argv[2]); + int numClasses = std::stoi(argv[3]); + GLOBAL_WORKSPACE_SIZE = EfficientNMSWorkspaceSize(batchSize, numScoreElements, numClasses); + + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(int64_t* num_detections, + half* detection_boxes, + half* detection_scores, + int64_t* detection_classe, + const half* proposals, + const half* fgScores, + int64_t* batch, + int64_t* num_rois, + int64_t* num_classes, + const int preNmsTop, + const int nmsMaxOut, + const float iouThreshold, + const float minBoxSize, + uint8_t* workspace, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{num_detections}}, +{{indent}} {{detection_boxes}}, +{{indent}} {{detection_scores}}, +{{indent}} {{detection_classe}}, +{{indent}} {{proposals}}, +{{indent}} {{fgScores}}, +{{indent}} {{p_batch}}, +{{indent}} {{num_rois}}, +{{indent}} {{num_classes}}, +{{indent}} {{preNmsTop}}, +{{indent}} {{nmsMaxOut}}, +{{indent}} {{iouThreshold}}, +{{indent}} {{minBoxSize}}, +{{indent}} global_workspace, stream /* default stream */ +{{indent}}); + """ +) + + +def gen_function(func_attrs: Dict[str, Any], header_files, backend_spec) -> str: + """the function for generating nms kernel""" + return FUNC_TEMPLATE.render( + header_files=header_files, + kernel=kernel.render(prefix=backend_spec.prefix, cub=backend_spec.cub), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], prefix=backend_spec.prefix + ), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec): + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], prefix=backend_spec.prefix + ).strip() + ) + + +def gen_function_call(func_attrs, backend_spec, indent=" "): + """the function for generating a function call for nms op""" + + assert len(func_attrs["outputs"]) == 4 + assert len(func_attrs["inputs"]) == 2 + + num_detections = FUNC_CALL_INT64_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + detection_boxes = backend_spec.cast_to_half_ptr_template.render( + name=func_attrs["outputs"][1]._attrs["name"] + ) + detection_scores = backend_spec.cast_to_half_ptr_template.render( + name=func_attrs["outputs"][2]._attrs["name"] + ) + detection_classes = FUNC_CALL_INT64_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][3]._attrs["name"] + ) + (input_name, score_name) = ( + 
backend_spec.cast_to_half_ptr_template.render(name=input_tensor._attrs["name"]) + for input_tensor in func_attrs["inputs"] + ) + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + num_detections=num_detections, + detection_boxes=detection_boxes, + detection_scores=detection_scores, + detection_classe=detection_classes, + proposals=input_name, + fgScores=score_name, + p_batch="&" + xshape[0]._attrs["name"], + num_rois="&" + xshape[1]._attrs["name"], + num_classes="&" + xshape[2]._attrs["name"], + preNmsTop=func_attrs["preNmsTop"], + nmsMaxOut=func_attrs["nmsMaxOut"], + iouThreshold=func_attrs["iouThreshold"], + minBoxSize=func_attrs["minBoxSize"], + indent=indent, + ) + + +def add_profiler(file_pairs, workdir, op_type, output_name, code): + """generate nms kernel for profiling""" + prefix = os.path.join(workdir, "profiler", op_type) + if not os.path.exists(prefix): + os.makedirs(prefix) + src_path = os.path.join(prefix, output_name + ".cu") + obj_path = os.path.join(prefix, output_name) + if os.path.exists(obj_path): + return + with open(src_path, "w") as f: + f.write(code) + file_pairs.append((src_path, obj_path)) + + +def gen_profiler(func_attrs, workdir, header_files, backend_spec): + """the function for generating profiler for nms op""" + op_type = func_attrs["op"] + file_pairs = [] + code = PROFILER_TEMPLATE.render( + header_files=header_files, + kernel=kernel.render(prefix=backend_spec.prefix, cub=backend_spec.cub), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], prefix=backend_spec.prefix + ), + ) + op_name = func_attrs["op"] + add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + target = Target.current() + compile_engine = builder.Builder() + compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True)) diff --git a/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py b/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py new file mode 100644 index 000000000..5d5631f14 --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/efficient_nms_kernel.py @@ -0,0 +1,1160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +efficient_nms function gpu kernel. +""" +import jinja2 + +kernel = jinja2.Template( + """ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define NMS_TILES 5 + +#define CSC(call, err) \ + do { \ + {{prefix}}Error_t {{prefix}}Status = call; \ + if ({{prefix}}Status != {{prefix}}Success) { \ + return err; \ + } \ + } while (0) + +#ifndef TRT_EFFICIENT_NMS_INFERENCE_CUH +#define TRT_EFFICIENT_NMS_INFERENCE_CUH + +// FP32 Intrinsics + +float __device__ __inline__ exp_mp(const float a) { + return __expf(a); +} +float __device__ __inline__ sigmoid_mp(const float a) { + return __frcp_rn(__fadd_rn(1.f, __expf(-a))); +} +float __device__ __inline__ add_mp(const float a, const float b) { + return __fadd_rn(a, b); +} +float __device__ __inline__ sub_mp(const float a, const float b) { + return __fsub_rn(a, b); +} +float __device__ __inline__ mul_mp(const float a, const float b) { + return __fmul_rn(a, b); +} +bool __device__ __inline__ gt_mp(const float a, const float b) { + return a > b; +} +bool __device__ __inline__ lt_mp(const float a, const float b) { + return a < b; +} +bool __device__ __inline__ lte_mp(const float a, const float b) { + return a <= b; +} +bool __device__ __inline__ gte_mp(const float a, const float b) { + return a >= b; +} + +#if __CUDA_ARCH__ >= 530 + +// FP16 Intrinsics + +__half __device__ __inline__ exp_mp(const __half a) { + return hexp(a); +} +__half __device__ __inline__ sigmoid_mp(const __half a) { + return hrcp(__hadd((__half)1, hexp(__hneg(a)))); +} +__half __device__ __inline__ add_mp(const __half a, const __half b) { + return __hadd(a, b); +} +__half __device__ __inline__ sub_mp(const __half a, const __half b) { + return __hsub(a, b); +} +__half __device__ __inline__ mul_mp(const __half a, const __half b) { + return __hmul(a, b); +} +bool __device__ __inline__ gt_mp(const __half a, const __half b) { + return __hgt(a, b); +} +bool __device__ __inline__ lt_mp(const __half a, const __half b) { + return __hlt(a, b); +} +bool __device__ __inline__ lte_mp(const __half a, const __half b) { + return __hle(a, b); +} +bool __device__ __inline__ gte_mp(const __half a, const __half b) { + return __hge(a, b); +} + +#else + +// FP16 Fallbacks on older architectures that lack support + +__half __device__ __inline__ exp_mp(const __half a) { + return __float2half(exp_mp(__half2float(a))); +} +__half __device__ __inline__ sigmoid_mp(const __half a) { + return __float2half(sigmoid_mp(__half2float(a))); +} +__half __device__ __inline__ add_mp(const __half a, const __half b) { + return __float2half(add_mp(__half2float(a), __half2float(b))); +} +__half __device__ __inline__ sub_mp(const __half a, const __half b) { + return __float2half(sub_mp(__half2float(a), __half2float(b))); +} +__half __device__ __inline__ mul_mp(const __half a, const __half b) { + return __float2half(mul_mp(__half2float(a), __half2float(b))); +} +bool __device__ __inline__ gt_mp(const __half a, const __half b) { + return __float2half(gt_mp(__half2float(a), __half2float(b))); +} +bool __device__ __inline__ lt_mp(const __half a, const __half b) { + return __float2half(lt_mp(__half2float(a), __half2float(b))); +} +bool __device__ __inline__ lte_mp(const __half a, const __half b) { + return __float2half(lte_mp(__half2float(a), __half2float(b))); +} +bool __device__ __inline__ gte_mp(const __half a, const __half b) { + return __float2half(gte_mp(__half2float(a), __half2float(b))); +} + +#endif + +typedef enum { + STATUS_SUCCESS = 0, + STATUS_FAILURE = 1, + STATUS_BAD_PARAM = 2, + STATUS_NOT_SUPPORTED = 3, + STATUS_NOT_INITIALIZED = 4 +} 
pluginStatus_t; + +struct EfficientNMSParameters { + // Related to NMS Options + float iouThreshold = 0.5f; + float scoreThreshold = 0.5f; + int numOutputBoxes = 100; + int numOutputBoxesPerClass = -1; + bool padOutputBoxesPerClass = false; + int backgroundClass = -1; + bool scoreSigmoid = false; + bool clipBoxes = false; + int boxCoding = 0; // BoxCorner + + // Related to NMS Internals + int numSelectedBoxes = 4096; + int scoreBits = -1; + bool outputONNXIndices = false; + + // Related to Tensor Configuration + // (These are set by the various plugin configuration methods, no need to + // define them during plugin creation.) + int batchSize = -1; + int numClasses = 1; + int numBoxElements = -1; + int numScoreElements = -1; + int numAnchors = -1; + bool shareLocation = true; + bool shareAnchors = true; + bool boxDecoder = false; + // DataType datatype = DataType::kFLOAT; +}; + +template +struct __align__(4 * sizeof(T)) BoxCorner; + +template +struct __align__(4 * sizeof(T)) BoxCenterSize; + +template +struct __align__(4 * sizeof(T)) BoxCorner { + // For NMS/IOU purposes, YXYX coding is identical to XYXY + T y1, x1, y2, x2; + + __device__ void reorder() { + if (gt_mp(y1, y2)) { + // Swap values, so y1 < y2 + y1 = sub_mp(y1, y2); + y2 = add_mp(y1, y2); + y1 = sub_mp(y2, y1); + } + if (gt_mp(x1, x2)) { + // Swap values, so x1 < x2 + x1 = sub_mp(x1, x2); + x2 = add_mp(x1, x2); + x1 = sub_mp(x2, x1); + } + } + + __device__ BoxCorner clip(T low, T high) const { + return { + lt_mp(y1, low) ? low : (gt_mp(y1, high) ? high : y1), + lt_mp(x1, low) ? low : (gt_mp(x1, high) ? high : x1), + lt_mp(y2, low) ? low : (gt_mp(y2, high) ? high : y2), + lt_mp(x2, low) ? low : (gt_mp(x2, high) ? high : x2)}; + } + + __device__ BoxCorner decode(BoxCorner anchor) const { + return { + add_mp(y1, anchor.y1), + add_mp(x1, anchor.x1), + add_mp(y2, anchor.y2), + add_mp(x2, anchor.x2)}; + } + + __device__ float area() const { + T w = sub_mp(x2, x1); + T h = sub_mp(y2, y1); + if (lte_mp(h, (T)0)) { + return 0; + } + if (lte_mp(w, (T)0)) { + return 0; + } + return (float)h * (float)w; + } + + __device__ operator BoxCenterSize() const { + T w = sub_mp(x2, x1); + T h = sub_mp(y2, y1); + return BoxCenterSize{ + add_mp(y1, mul_mp((T)0.5, h)), add_mp(x1, mul_mp((T)0.5, w)), h, w}; + } + + __device__ static BoxCorner intersect(BoxCorner a, BoxCorner b) { + return { + gt_mp(a.y1, b.y1) ? a.y1 : b.y1, + gt_mp(a.x1, b.x1) ? a.x1 : b.x1, + lt_mp(a.y2, b.y2) ? a.y2 : b.y2, + lt_mp(a.x2, b.x2) ? 
a.x2 : b.x2}; + } +}; + +template +struct __align__(4 * sizeof(T)) BoxCenterSize { + // For NMS/IOU purposes, YXHW coding is identical to XYWH + T y, x, h, w; + + __device__ void reorder() {} + + __device__ BoxCenterSize clip(T low, T high) const { + return BoxCenterSize(BoxCorner(*this).clip(low, high)); + } + + __device__ BoxCenterSize decode(BoxCenterSize anchor) const { + return { + add_mp(mul_mp(y, anchor.h), anchor.y), + add_mp(mul_mp(x, anchor.w), anchor.x), + mul_mp(anchor.h, exp_mp(h)), + mul_mp(anchor.w, exp_mp(w))}; + } + + __device__ float area() const { + if (h <= (T)0) { + return 0; + } + if (w <= (T)0) { + return 0; + } + return (float)h * (float)w; + } + + __device__ operator BoxCorner() const { + T h2 = mul_mp(h, (T)0.5); + T w2 = mul_mp(w, (T)0.5); + return BoxCorner{ + sub_mp(y, h2), sub_mp(x, w2), add_mp(y, h2), add_mp(x, w2)}; + } + __device__ static BoxCenterSize intersect( + BoxCenterSize a, BoxCenterSize b) { + return BoxCenterSize( + BoxCorner::intersect(BoxCorner(a), BoxCorner(b))); + } +}; + +#endif + +template +__device__ float +IOU(EfficientNMSParameters param, BoxCorner box1, BoxCorner box2) { + // Regardless of the selected box coding, IOU is always performed in BoxCorner + // coding. The boxes are copied so that they can be reordered without + // affecting the originals. + BoxCorner b1 = box1; + BoxCorner b2 = box2; + b1.reorder(); + b2.reorder(); + float intersectArea = BoxCorner::intersect(b1, b2).area(); + if (intersectArea <= 0.f) { + return 0.f; + } + float unionArea = b1.area() + b2.area() - intersectArea; + if (unionArea <= 0.f) { + return 0.f; + } + return intersectArea / unionArea; +} + +template +__device__ BoxCorner DecodeBoxes( + EfficientNMSParameters param, + int boxIdx, + int anchorIdx, + const Tb* __restrict__ boxesInput, + const Tb* __restrict__ anchorsInput) { + // The inputs will be in the selected coding format, as well as the decoding + // function. But the decoded box will always be returned as BoxCorner. + Tb box = boxesInput[boxIdx]; + if (!param.boxDecoder) { + return BoxCorner(box); + } + Tb anchor = anchorsInput[anchorIdx]; + box.reorder(); + anchor.reorder(); + return BoxCorner(box.decode(anchor)); +} + +template +__device__ void MapNMSData( + EfficientNMSParameters param, + int idx, + int imageIdx, + const Tb* __restrict__ boxesInput, + const Tb* __restrict__ anchorsInput, + const int* __restrict__ topClassData, + const int* __restrict__ topAnchorsData, + const int* __restrict__ topNumData, + const T* __restrict__ sortedScoresData, + const int* __restrict__ sortedIndexData, + T& scoreMap, + int& classMap, + BoxCorner& boxMap, + int& boxIdxMap) { + // idx: Holds the NMS box index, within the current batch. + // idxSort: Holds the batched NMS box index, which indexes the (filtered, but + // sorted) score buffer. scoreMap: Holds the score that corresponds to the + // indexed box being processed by NMS. + if (idx >= topNumData[imageIdx]) { + return; + } + int idxSort = imageIdx * param.numScoreElements + idx; + scoreMap = sortedScoresData[idxSort]; + + // idxMap: Holds the re-mapped index, which indexes the (filtered, but + // unsorted) buffers. classMap: Holds the class that corresponds to the idx'th + // sorted score being processed by NMS. anchorMap: Holds the anchor that + // corresponds to the idx'th sorted score being processed by NMS. 
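+  // Two indirections in total: sortedIndexData maps the sorted position back
+  // into the filtered (unsorted) buffers, and the class/anchor stored there
+  // index the raw boxes/anchors inputs below.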
+ int idxMap = imageIdx * param.numScoreElements + sortedIndexData[idxSort]; + classMap = topClassData[idxMap]; + int anchorMap = topAnchorsData[idxMap]; + + // boxIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and + // unsorted) boxes input buffer. + boxIdxMap = -1; + if (param.shareLocation) // Shape of boxesInput: [batchSize, numAnchors, 1, 4] + { + boxIdxMap = imageIdx * param.numAnchors + anchorMap; + } else // Shape of boxesInput: [batchSize, numAnchors, numClasses, 4] + { + int batchOffset = imageIdx * param.numAnchors * param.numClasses; + int anchorOffset = anchorMap * param.numClasses; + boxIdxMap = batchOffset + anchorOffset + classMap; + } + // anchorIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, + // and unsorted) anchors input buffer. + int anchorIdxMap = -1; + if (param.shareAnchors) // Shape of anchorsInput: [1, numAnchors, 4] + { + anchorIdxMap = anchorMap; + } else // Shape of anchorsInput: [batchSize, numAnchors, 4] + { + anchorIdxMap = imageIdx * param.numAnchors + anchorMap; + } + // boxMap: Holds the box that corresponds to the idx'th sorted score being + // processed by NMS. + boxMap = DecodeBoxes( + param, boxIdxMap, anchorIdxMap, boxesInput, anchorsInput); +} + +template +__device__ void WriteNMSResult( + EfficientNMSParameters param, + int64_t* __restrict__ numDetectionsOutput, + T* __restrict__ nmsScoresOutput, + int64_t* __restrict__ nmsClassesOutput, + BoxCorner* __restrict__ nmsBoxesOutput, + T threadScore, + int threadClass, + BoxCorner threadBox, + int imageIdx, + unsigned int resultsCounter) { + int outputIdx = imageIdx * param.numOutputBoxes + resultsCounter - 1; + if (param.scoreSigmoid) { + nmsScoresOutput[outputIdx] = sigmoid_mp(threadScore); + } else if (param.scoreBits > 0) { + nmsScoresOutput[outputIdx] = add_mp(threadScore, (T)-1); + } else { + nmsScoresOutput[outputIdx] = threadScore; + } + nmsClassesOutput[outputIdx] = (int64_t)threadClass; + if (param.clipBoxes) { + nmsBoxesOutput[outputIdx] = threadBox.clip((T)0, (T)1); + } else { + nmsBoxesOutput[outputIdx] = threadBox; + } + numDetectionsOutput[imageIdx] = (int64_t)resultsCounter; +} + +__device__ void WriteONNXResult( + EfficientNMSParameters param, + int* outputIndexData, + int* __restrict__ nmsIndicesOutput, + int imageIdx, + int threadClass, + int boxIdxMap) { + int index = boxIdxMap % param.numAnchors; + int idx = atomicAdd((unsigned int*)&outputIndexData[0], 1); + nmsIndicesOutput[idx * 3 + 0] = imageIdx; + nmsIndicesOutput[idx * 3 + 1] = threadClass; + nmsIndicesOutput[idx * 3 + 2] = index; +} + +__global__ void PadONNXResult( + EfficientNMSParameters param, + int* outputIndexData, + int* __restrict__ nmsIndicesOutput) { + if (threadIdx.x > 0) { + return; + } + int pidx = outputIndexData[0] - 1; + if (pidx < 0) { + return; + } + for (int idx = pidx + 1; idx < param.batchSize * param.numOutputBoxes; + idx++) { + nmsIndicesOutput[idx * 3 + 0] = nmsIndicesOutput[pidx * 3 + 0]; + nmsIndicesOutput[idx * 3 + 1] = nmsIndicesOutput[pidx * 3 + 1]; + nmsIndicesOutput[idx * 3 + 2] = nmsIndicesOutput[pidx * 3 + 2]; + } +} + +template +__global__ void EfficientNMS( + EfficientNMSParameters param, + const int* topNumData, + int* outputIndexData, + int* outputClassData, + const int* sortedIndexData, + const T* __restrict__ sortedScoresData, + const int* __restrict__ topClassData, + const int* __restrict__ topAnchorsData, + const Tb* __restrict__ boxesInput, + const Tb* __restrict__ anchorsInput, + int64_t* __restrict__ numDetectionsOutput, + T* __restrict__ 
nmsScoresOutput, + int64_t* __restrict__ nmsClassesOutput, + int* __restrict__ nmsIndicesOutput, + BoxCorner* __restrict__ nmsBoxesOutput) { + unsigned int thread = threadIdx.x; + unsigned int imageIdx = blockIdx.y; + unsigned int tileSize = blockDim.x; + if (imageIdx >= param.batchSize) { + return; + } + + int numSelectedBoxes = min(topNumData[imageIdx], param.numSelectedBoxes); + int numTiles = (numSelectedBoxes + tileSize - 1) / tileSize; + if (thread >= numSelectedBoxes) { + return; + } + + __shared__ int blockState; + __shared__ unsigned int resultsCounter; + if (thread == 0) { + blockState = 0; + resultsCounter = 0; + } + + int threadState[NMS_TILES]; + unsigned int boxIdx[NMS_TILES]; + T threadScore[NMS_TILES]; + int threadClass[NMS_TILES]; + BoxCorner threadBox[NMS_TILES]; + int boxIdxMap[NMS_TILES]; + for (int tile = 0; tile < numTiles; tile++) { + threadState[tile] = 0; + boxIdx[tile] = thread + tile * blockDim.x; + MapNMSData( + param, + boxIdx[tile], + imageIdx, + boxesInput, + anchorsInput, + topClassData, + topAnchorsData, + topNumData, + sortedScoresData, + sortedIndexData, + threadScore[tile], + threadClass[tile], + threadBox[tile], + boxIdxMap[tile]); + } + + // Iterate through all boxes to NMS against. + for (int i = 0; i < numSelectedBoxes; i++) { + int tile = i / tileSize; + + if (boxIdx[tile] == i) { + // Iteration lead thread, figure out what the other threads should do, + // this will be signaled via the blockState shared variable. + if (threadState[tile] == -1) { + // Thread already dead, this box was already dropped in a previous + // iteration, because it had a large IOU overlap with another lead + // thread previously, so it would never be kept anyway, therefore it can + // safely be skip all IOU operations in this iteration. + blockState = -1; // -1 => Signal all threads to skip iteration + } else if (threadState[tile] == 0) { + // As this box will be kept, this is a good place to find what index in + // the results buffer it should have, as this allows to perform an early + // loop exit if there are enough results. + if (resultsCounter >= param.numOutputBoxes) { + blockState = -2; // -2 => Signal all threads to do an early loop exit. + } else { + // Thread is still alive, because it has not had a large enough IOU + // overlap with any other kept box previously. Therefore, this box + // will be kept for sure. However, we need to check against all other + // subsequent boxes from this position onward, to see how those other + // boxes will behave in future iterations. + blockState = 1; // +1 => Signal all (higher index) threads to + // calculate IOU against this box + threadState[tile] = 1; // +1 => Mark this box's thread to be kept and + // written out to results + + // If the numOutputBoxesPerClass check is enabled, write the result + // only if the limit for this class on this image has not been reached + // yet. Other than (possibly) skipping the write, this won't affect + // anything else in the NMS threading. + bool write = true; + if (param.numOutputBoxesPerClass >= 0) { + int classCounterIdx = + imageIdx * param.numClasses + threadClass[tile]; + write = + (outputClassData[classCounterIdx] < + param.numOutputBoxesPerClass); + outputClassData[classCounterIdx]++; + } + if (write) { + // This branch is visited by one thread per iteration, so it's safe + // to do non-atomic increments. 
+ resultsCounter++; + if (param.outputONNXIndices) { + WriteONNXResult( + param, + outputIndexData, + nmsIndicesOutput, + imageIdx, + threadClass[tile], + boxIdxMap[tile]); + } else { + WriteNMSResult( + param, + numDetectionsOutput, + nmsScoresOutput, + nmsClassesOutput, + nmsBoxesOutput, + threadScore[tile], + threadClass[tile], + threadBox[tile], + imageIdx, + resultsCounter); + } + } + } + } else { + // This state should never be reached, but just in case... + blockState = 0; // 0 => Signal all threads to not do any updates, + // nothing happens. + } + } + + __syncthreads(); + + if (blockState == -2) { + // This is the signal to exit from the loop. + return; + } + + if (blockState == -1) { + // This is the signal for all threads to just skip this iteration, as no + // IOU's need to be checked. + continue; + } + + // Grab a box and class to test the current box against. The test box + // corresponds to iteration i, therefore it will have a lower index than the + // current thread box, and will therefore have a higher score than the + // current box because it's located "before" in the sorted score list. + T testScore; + int testClass; + BoxCorner testBox; + int testBoxIdxMap; + MapNMSData( + param, + i, + imageIdx, + boxesInput, + anchorsInput, + topClassData, + topAnchorsData, + topNumData, + sortedScoresData, + sortedIndexData, + testScore, + testClass, + testBox, + testBoxIdxMap); + + for (int tile = 0; tile < numTiles; tile++) { + // IOU + if (boxIdx[tile] > i && // Make sure two different boxes are being tested, + // and that it's a higher index; + boxIdx[tile] < numSelectedBoxes && // Make sure the box is within + // numSelectedBoxes; + blockState == 1 && // Signal that allows IOU checks to be performed; + threadState[tile] == 0 && // Make sure this box hasn't been either + // dropped or kept already; + threadClass[tile] == + testClass && // Compare only boxes of matching classes; + lte_mp(threadScore[tile], testScore) && // Make sure the sorting order + // of scores is as expected; + IOU(param, threadBox[tile], testBox) >= + param.iouThreshold) // And... IOU overlap. + { + // Current box overlaps with the box tested in this iteration, this box + // will be skipped. + threadState[tile] = -1; // -1 => Mark this box's thread to be dropped. + } + } + } +} + +template +{{prefix}}Error_t EfficientNMSLauncher( + EfficientNMSParameters& param, + int* topNumData, + int* outputIndexData, + int* outputClassData, + int* sortedIndexData, + T* sortedScoresData, + int* topClassData, + int* topAnchorsData, + const void* boxesInput, + const void* anchorsInput, + int64_t* numDetectionsOutput, + T* nmsScoresOutput, + int64_t* nmsClassesOutput, + int* nmsIndicesOutput, + void* nmsBoxesOutput, + {{prefix}}Stream_t stream) { + unsigned int tileSize = param.numSelectedBoxes / NMS_TILES; + if (param.numSelectedBoxes <= 512) { + tileSize = 512; + } + if (param.numSelectedBoxes <= 256) { + tileSize = 256; + } + + const dim3 blockSize = {tileSize, 1, 1}; + const dim3 gridSize = {1, (unsigned int)param.batchSize, 1}; + + if (param.boxCoding == 0) { + EfficientNMS><<>>( + param, + topNumData, + outputIndexData, + outputClassData, + sortedIndexData, + sortedScoresData, + topClassData, + topAnchorsData, + (BoxCorner*)boxesInput, + (BoxCorner*)anchorsInput, + numDetectionsOutput, + nmsScoresOutput, + nmsClassesOutput, + nmsIndicesOutput, + (BoxCorner*)nmsBoxesOutput); + } else if (param.boxCoding == 1) { + // Note that nmsBoxesOutput is always coded as BoxCorner, regardless of + // the input coding type. 
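+    // The launch below therefore reads BoxCenterSize inputs, while
+    // DecodeBoxes converts every box to corner form before IOU tests and
+    // before it is written to the output.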
+    EfficientNMS<T, BoxCenterSize<T>><<<gridSize, blockSize, 0, stream>>>(
+        param,
+        topNumData,
+        outputIndexData,
+        outputClassData,
+        sortedIndexData,
+        sortedScoresData,
+        topClassData,
+        topAnchorsData,
+        (BoxCenterSize<T>*)boxesInput,
+        (BoxCenterSize<T>*)anchorsInput,
+        numDetectionsOutput,
+        nmsScoresOutput,
+        nmsClassesOutput,
+        nmsIndicesOutput,
+        (BoxCorner<T>*)nmsBoxesOutput);
+  }
+
+  if (param.outputONNXIndices) {
+    PadONNXResult<<<1, 1, 0, stream>>>(
+        param, outputIndexData, nmsIndicesOutput);
+  }
+
+  return {{prefix}}GetLastError();
+}
+
+__global__ void EfficientNMSFilterSegments(
+    EfficientNMSParameters param,
+    const int* __restrict__ topNumData,
+    int* __restrict__ topOffsetsStartData,
+    int* __restrict__ topOffsetsEndData) {
+  int imageIdx = threadIdx.x;
+  if (imageIdx > param.batchSize) {
+    return;
+  }
+  topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+  topOffsetsEndData[imageIdx] =
+      imageIdx * param.numScoreElements + topNumData[imageIdx];
+}
+
+template <typename T>
+__global__ void EfficientNMSFilter(
+    EfficientNMSParameters param,
+    const T* __restrict__ scoresInput,
+    int* __restrict__ topNumData,
+    int* __restrict__ topIndexData,
+    int* __restrict__ topAnchorsData,
+    T* __restrict__ topScoresData,
+    int* __restrict__ topClassData) {
+  int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+  int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+  // Boundary Conditions
+  if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) {
+    return;
+  }
+
+  // Shape of scoresInput: [batchSize, numAnchors, numClasses]
+  int scoresInputIdx = imageIdx * param.numScoreElements + elementIdx;
+
+  // For each class, check its corresponding score if it crosses the threshold,
+  // and if so select this anchor, and keep track of the maximum score and the
+  // corresponding (argmax) class id
+  T score = scoresInput[scoresInputIdx];
+  if (gte_mp(score, (T)param.scoreThreshold)) {
+    // Unpack the class and anchor index from the element index
+    int classIdx = elementIdx % param.numClasses;
+    int anchorIdx = elementIdx / param.numClasses;
+
+    // If this is a background class, ignore it.
+    if (classIdx == param.backgroundClass) {
+      return;
+    }
+
+    // Use an atomic to find an open slot where to write the selected anchor
+    // data.
+    if (topNumData[imageIdx] >= param.numScoreElements) {
+      return;
+    }
+    int selectedIdx = atomicAdd((unsigned int*)&topNumData[imageIdx], 1);
+    if (selectedIdx >= param.numScoreElements) {
+      topNumData[imageIdx] = param.numScoreElements;
+      return;
+    }
+
+    // Shape of topScoresData / topClassData: [batchSize, numScoreElements]
+    int topIdx = imageIdx * param.numScoreElements + selectedIdx;
+
+    if (param.scoreBits > 0) {
+      score = add_mp(score, (T)1);
+      if (gt_mp(score, (T)(2.f - 1.f / 1024.f))) {
+        // Ensure the incremented score fits in the mantissa without changing
+        // the exponent
+        score = (2.f - 1.f / 1024.f);
+      }
+    }
+
+    topIndexData[topIdx] = selectedIdx;
+    topAnchorsData[topIdx] = anchorIdx;
+    topScoresData[topIdx] = score;
+    topClassData[topIdx] = classIdx;
+  }
+}
+
+template <typename T>
+__global__ void EfficientNMSDenseIndex(
+    EfficientNMSParameters param,
+    int* __restrict__ topNumData,
+    int* __restrict__ topIndexData,
+    int* __restrict__ topAnchorsData,
+    int* __restrict__ topOffsetsStartData,
+    int* __restrict__ topOffsetsEndData,
+    T* __restrict__ topScoresData,
+    int* __restrict__ topClassData) {
+  int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+  int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+  if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) {
+    return;
+  }
+
+  int dataIdx = imageIdx * param.numScoreElements + elementIdx;
+  int anchorIdx = elementIdx / param.numClasses;
+  int classIdx = elementIdx % param.numClasses;
+  if (param.scoreBits > 0) {
+    T score = topScoresData[dataIdx];
+    if (lt_mp(score, (T)param.scoreThreshold)) {
+      score = (T)1;
+    } else if (classIdx == param.backgroundClass) {
+      score = (T)1;
+    } else {
+      score = add_mp(score, (T)1);
+      if (gt_mp(score, (T)(2.f - 1.f / 1024.f))) {
+        // Ensure the incremented score fits in the mantissa without changing
+        // the exponent
+        score = (2.f - 1.f / 1024.f);
+      }
+    }
+    topScoresData[dataIdx] = score;
+  } else {
+    T score = topScoresData[dataIdx];
+    if (lt_mp(score, (T)param.scoreThreshold)) {
+      topScoresData[dataIdx] = -(1 << 15);
+    } else if (classIdx == param.backgroundClass) {
+      topScoresData[dataIdx] = -(1 << 15);
+    }
+  }
+
+  topIndexData[dataIdx] = elementIdx;
+  topAnchorsData[dataIdx] = anchorIdx;
+  topClassData[dataIdx] = classIdx;
+
+  if (elementIdx == 0) {
+    // Saturate counters
+    topNumData[imageIdx] = param.numScoreElements;
+    topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+    topOffsetsEndData[imageIdx] = (imageIdx + 1) * param.numScoreElements;
+  }
+}
+
+template <typename T>
+{{prefix}}Error_t EfficientNMSFilterLauncher(
+    EfficientNMSParameters& param,
+    const T* scoresInput,
+    int* topNumData,
+    int* topIndexData,
+    int* topAnchorsData,
+    int* topOffsetsStartData,
+    int* topOffsetsEndData,
+    T* topScoresData,
+    int* topClassData,
+    {{prefix}}Stream_t stream) {
+  const unsigned int elementsPerBlock = 512;
+  const unsigned int imagesPerBlock = 1;
+  const unsigned int elementBlocks =
+      (param.numScoreElements + elementsPerBlock - 1) / elementsPerBlock;
+  const unsigned int imageBlocks =
+      (param.batchSize + imagesPerBlock - 1) / imagesPerBlock;
+  const dim3 blockSize = {elementsPerBlock, imagesPerBlock, 1};
+  const dim3 gridSize = {elementBlocks, imageBlocks, 1};
+
+  float kernelSelectThreshold = 0.007f;
+  if (param.scoreSigmoid) {
+    // Inverse Sigmoid
+    if (param.scoreThreshold <= 0.f) {
+      param.scoreThreshold = -(1 << 15);
+    } else {
+      param.scoreThreshold =
+          logf(param.scoreThreshold / (1.f - param.scoreThreshold));
+    }
+    kernelSelectThreshold =
+        logf(kernelSelectThreshold / (1.f - kernelSelectThreshold));
+    // Disable Score Bits Optimization
+    param.scoreBits = -1;
+  }
+
+  if (param.scoreThreshold < kernelSelectThreshold) {
+    // A full copy of the buffer is necessary because sorting will scramble the
+    // input data otherwise.
+    {{prefix}}MemcpyAsync(
+        topScoresData,
+        scoresInput,
+        param.batchSize * param.numScoreElements * sizeof(T),
+        {{prefix}}MemcpyDeviceToDevice,
+        stream);
+
+    EfficientNMSDenseIndex<T><<<gridSize, blockSize, 0, stream>>>(
+        param,
+        topNumData,
+        topIndexData,
+        topAnchorsData,
+        topOffsetsStartData,
+        topOffsetsEndData,
+        topScoresData,
+        topClassData);
+  } else {
+    EfficientNMSFilter<T><<<gridSize, blockSize, 0, stream>>>(
+        param,
+        scoresInput,
+        topNumData,
+        topIndexData,
+        topAnchorsData,
+        topScoresData,
+        topClassData);
+
+    EfficientNMSFilterSegments<<<1, param.batchSize, 0, stream>>>(
+        param, topNumData, topOffsetsStartData, topOffsetsEndData);
+  }
+
+  return {{prefix}}GetLastError();
+}
+
+template <typename T>
+size_t EfficientNMSSortWorkspaceSize(int batchSize, int numScoreElements) {
+  size_t sortedWorkspaceSize = 0;
+  {{cub}}::DoubleBuffer<T> keysDB(nullptr, nullptr);
+  {{cub}}::DoubleBuffer<int> valuesDB(nullptr, nullptr);
+  {{cub}}::DeviceSegmentedRadixSort::SortPairsDescending(
+      nullptr,
+      sortedWorkspaceSize,
+      keysDB,
+      valuesDB,
+      numScoreElements,
+      batchSize,
+      (const int*)nullptr,
+      (const int*)nullptr);
+  return sortedWorkspaceSize;
+}
+
+template <typename T>
+size_t
+EfficientNMSWorkspaceSize(int batchSize, int numScoreElements, int numClasses) {
+  size_t total = 0;
+  const size_t align = 256;
+  // Counters
+  // 3 for Filtering
+  // 1 for Output Indexing
+  // C for Max per Class Limiting
+  size_t size = (3 + 1 + numClasses) * batchSize * sizeof(int64_t);
+  total += size + (size % align ? align - (size % align) : 0);
+  // Int Buffers
+  for (int i = 0; i < 4; i++) {
+    size = batchSize * numScoreElements * sizeof(int64_t);
+    total += size + (size % align ? align - (size % align) : 0);
+  }
+  // Float Buffers
+  for (int i = 0; i < 2; i++) {
+    size = batchSize * numScoreElements * sizeof(T);
+    total += size + (size % align ? align - (size % align) : 0);
+  }
+  // Sort Workspace
+  size = EfficientNMSSortWorkspaceSize<T>(batchSize, numScoreElements);
+  total += size + (size % align ? align - (size % align) : 0);
+  return total;
+}
+
+template <typename T>
+T* EfficientNMSWorkspace(void* workspace, size_t& offset, size_t elements) {
+  T* buffer = (T*)((size_t)workspace + offset);
+  size_t align = 256;
+  size_t size = elements * sizeof(T);
+  size_t sizeAligned = size + (size % align ? align - (size % align) : 0);
+  offset += sizeAligned;
+  return buffer;
+}
+
+template <typename T>
+pluginStatus_t EfficientNMSDispatch(
+    EfficientNMSParameters param,
+    const void* boxesInput,
+    const void* scoresInput,
+    const void* anchorsInput,
+    void* numDetectionsOutput,
+    void* nmsBoxesOutput,
+    void* nmsScoresOutput,
+    void* nmsClassesOutput,
+    void* nmsIndicesOutput,
+    void* workspace,
+    {{prefix}}Stream_t stream) {
+  // Clear Outputs (not all elements will get overwritten by the kernels, so
+  // safer to clear everything out)
+  if (param.outputONNXIndices) {
+    {{prefix}}MemsetAsync(
+        nmsIndicesOutput,
+        0xFF,
+        param.batchSize * param.numOutputBoxes * 3 * sizeof(int),
+        stream);
+  } else {
+    {{prefix}}MemsetAsync(
+        numDetectionsOutput, 0x00, param.batchSize * sizeof(int64_t), stream);
+    {{prefix}}MemsetAsync(
+        nmsScoresOutput,
+        0x00,
+        param.batchSize * param.numOutputBoxes * sizeof(T),
+        stream);
+    {{prefix}}MemsetAsync(
+        nmsBoxesOutput,
+        0x00,
+        param.batchSize * param.numOutputBoxes * 4 * sizeof(T),
+        stream);
+    {{prefix}}MemsetAsync(
+        nmsClassesOutput,
+        0x00,
+        param.batchSize * param.numOutputBoxes * sizeof(int64_t),
+        stream);
+  }
+
+  // Empty Inputs
+  if (param.numScoreElements < 1) {
+    return STATUS_SUCCESS;
+  }
+
+  // Counters Workspace
+  size_t workspaceOffset = 0; // 1 << 20;
+  int countersTotalSize = (3 + 1 + param.numClasses) * param.batchSize;
+  int* topNumData =
+      EfficientNMSWorkspace<int>(workspace, workspaceOffset, countersTotalSize);
+  int* topOffsetsStartData = topNumData + param.batchSize;
+  int* topOffsetsEndData = topNumData + 2 * param.batchSize;
+  int* outputIndexData = topNumData + 3 * param.batchSize;
+  int* outputClassData = topNumData + 4 * param.batchSize;
+  {{prefix}}MemsetAsync(topNumData, 0x00, countersTotalSize * sizeof(int), stream);
+  {{prefix}}Error_t status = {{prefix}}GetLastError();
+  CSC(status, STATUS_FAILURE);
+
+  // Other Buffers Workspace
+  int* topIndexData = EfficientNMSWorkspace<int>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  int* topClassData = EfficientNMSWorkspace<int>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  int* topAnchorsData = EfficientNMSWorkspace<int>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  int* sortedIndexData = EfficientNMSWorkspace<int>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  T* topScoresData = EfficientNMSWorkspace<T>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  T* sortedScoresData = EfficientNMSWorkspace<T>(
+      workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+  size_t sortedWorkspaceSize =
+      EfficientNMSSortWorkspaceSize<T>(param.batchSize, param.numScoreElements);
+  char* sortedWorkspaceData = EfficientNMSWorkspace<char>(
+      workspace, workspaceOffset, sortedWorkspaceSize);
+  {{cub}}::DoubleBuffer<T> scoresDB(topScoresData, sortedScoresData);
+  {{cub}}::DoubleBuffer<int> indexDB(topIndexData, sortedIndexData);
+
+  // Kernels
+  status = EfficientNMSFilterLauncher<T>(
+      param,
+      (T*)scoresInput,
+      topNumData,
+      topIndexData,
+      topAnchorsData,
+      topOffsetsStartData,
+      topOffsetsEndData,
+      topScoresData,
+      topClassData,
+      stream);
+  CSC(status, STATUS_FAILURE);
+
+  status = {{cub}}::DeviceSegmentedRadixSort::SortPairsDescending(
+      sortedWorkspaceData,
+      sortedWorkspaceSize,
+      scoresDB,
+      indexDB,
+      param.batchSize * param.numScoreElements,
+      param.batchSize,
+      topOffsetsStartData,
+      topOffsetsEndData,
+      param.scoreBits > 0 ? (10 - param.scoreBits) : 0,
+      param.scoreBits > 0 ?
10 : sizeof(T) * 8, + stream, + false); + CSC(status, STATUS_FAILURE); + + status = EfficientNMSLauncher( + param, + topNumData, + outputIndexData, + outputClassData, + indexDB.Current(), + scoresDB.Current(), + topClassData, + topAnchorsData, + boxesInput, + anchorsInput, + (int64_t*)numDetectionsOutput, + (T*)nmsScoresOutput, + (int64_t*)nmsClassesOutput, + (int*)nmsIndicesOutput, + nmsBoxesOutput, + stream); + CSC(status, STATUS_FAILURE); + + return STATUS_SUCCESS; +} + +void EfficientNMSInference( + EfficientNMSParameters param, + const void* boxesInput, + const void* scoresInput, + const void* anchorsInput, + void* numDetectionsOutput, + void* nmsBoxesOutput, + void* nmsScoresOutput, + void* nmsClassesOutput, + void* nmsIndicesOutput, + void* workspace, + {{prefix}}Stream_t stream) { + if (param.scoreBits <= 0 || param.scoreBits > 10) { + param.scoreBits = -1; + } + EfficientNMSDispatch<__half>( + param, + boxesInput, + scoresInput, + anchorsInput, + numDetectionsOutput, + nmsBoxesOutput, + nmsScoresOutput, + nmsClassesOutput, + nmsIndicesOutput, + workspace, + stream); +} +""" +) diff --git a/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py new file mode 100644 index 000000000..19f8bd6cd --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/multi_level_roi_align_common.py @@ -0,0 +1,464 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +multi-level roi align common functions for all backends. 
+""" + +import jinja2 + +# pylint: disable=C0103,C0415,W0613,C0301,W0612 + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}FPNRoiAlign( +{{indent}} in_ptr_p2, +{{indent}} in_ptr_p3, +{{indent}} in_ptr_p4, +{{indent}} in_ptr_p5, +{{indent}} rois_ptr, +{{indent}} out_ptr, +{{indent}} batchSize, +{{indent}} featureCount, +{{indent}} imageSize, +{{indent}} P2dims, +{{indent}} P3dims, +{{indent}} P4dims, +{{indent}} P5dims, +{{indent}} sampling_ratio, +{{indent}} spatial_scale, +{{indent}} position_sensitive, +{{indent}} continuous_coordinate, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { +// customized roi align kernel + +struct xy_t { + int64_t y; + int64_t x; + + xy_t() : y(0), x(0) {} + xy_t(int64_t y_, int64_t x_) : y(y_), x(x_) {} +}; + +template +__device__ inline T interpolateBilinear( + const T* src, + xy_t srcDims, + float y, + float x, + const int channels) { + // deal with cases that inverse elements are out of feature map boundary + int height = srcDims.y; + int width = srcDims.x; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return 0; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = T(1.0) - ly, hx = T(1.0) - lx; + // do bilinear interpolation + T v1 = src[channels * (y_low * width + x_low)]; + T v2 = src[channels * (y_low * width + x_high)]; + T v3 = src[channels * (y_high * width + x_low)]; + T v4 = src[channels * (y_high * width + x_high)]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void roiAlign_kernel( + xy_t imageSize, + int featureCount, + int roiCount, + float threshold, + int samplingRatio, + const Trois* rois, + const Tfeat* P2, + const xy_t P2dims, + const Tfeat* P3, + const xy_t P3dims, + const Tfeat* P4, + const xy_t P4dims, + const Tfeat* P5, + const xy_t P5dims, + Tfeat* pooled, + const xy_t poolDims) { + const int batch = blockIdx.x; + const int feature = blockIdx.y; + const int roiIdx = blockIdx.z; + + const Trois* roi = rois + 5 * (batch * roiCount + roiIdx); + float hw; + float x1 = __half2float(roi[1]); + float y1 = __half2float(roi[2]); + float x2 = __half2float(roi[3]); + float y2 = __half2float(roi[4]); + + y1 = max(0.f, min((float)imageSize.y, y1)) / imageSize.y; + x1 = max(0.f, min((float)imageSize.x, x1)) / imageSize.x; + y2 = max(0.f, min((float)imageSize.y, y2)) / imageSize.y; + x2 = max(0.f, min((float)imageSize.x, x2)) / imageSize.x; + + hw = (y2 - y1) * (x2 - x1); + + const Tfeat* src = P2; + xy_t srcDims = P2dims; + int iP = 2; + + if (hw > threshold) { + src = P3; + srcDims = P3dims; + ++iP; + } + threshold *= 4; + + if (hw > threshold) { + src = P4; + srcDims = P4dims; + ++iP; + } + threshold *= 4; + + if (hw > threshold) { + src = P5; + srcDims = P5dims; + ++iP; + } + + src += batch * srcDims.x * srcDims.y * featureCount + feature; + // batch, roiCount, poolx, pooly, featureCount + Tfeat* dst = pooled + + poolDims.x * poolDims.y * + (batch * roiCount * featureCount + roiIdx * featureCount) + + feature; + + float samplingOffset = 0.5f; + float 
inputOffset = 0.5f; + + float yStart = y1 * srcDims.y - inputOffset; + float xStart = x1 * srcDims.x - inputOffset; + + float yEnd = y2 * srcDims.y - inputOffset; + float xEnd = x2 * srcDims.x - inputOffset; + + float yDelta = (yEnd - yStart) / poolDims.y; + float xDelta = (xEnd - xStart) / poolDims.x; + + const int samplingRatioX = samplingRatio > 0 + ? samplingRatio + : max(1, (int)ceilf((xEnd - xStart) / poolDims.x)); + const int samplingRatioY = samplingRatio > 0 + ? samplingRatio + : max(1, (int)ceilf((yEnd - yStart) / poolDims.y)); + const int samplingCount = samplingRatioX * samplingRatioY; + + for (int outIdx = threadIdx.x; outIdx < poolDims.x * poolDims.y; + outIdx += blockDim.x) { + int xx = outIdx % poolDims.x; + int yy = outIdx / poolDims.x; + Tfeat* out = dst + (poolDims.x * yy + xx) * featureCount; + Tfeat result = 0; + for (int iy = 0; iy < samplingRatioY; iy++) { + float ySample = yStart + yDelta * yy; + ySample += yDelta * (iy + samplingOffset) / samplingRatioY; + ySample = min(max(ySample, 0.f), srcDims.y - 1.0f); + + for (int ix = 0; ix < samplingRatioX; ix++) { + float xSample = xStart + xDelta * xx; + xSample += xDelta * (ix + samplingOffset) / samplingRatioX; + xSample = min(max(xSample, 0.f), srcDims.x - 1.0f); + + result += + interpolateBilinear(src, srcDims, ySample, xSample, featureCount); + } + } + *out = result / __float2half_rn(samplingCount); + } +} + +template +void FPNRoiAlign( + {{elem_input_type}}* P2, + {{elem_input_type}}* P3, + {{elem_input_type}}* P4, + {{elem_input_type}}* P5, + {{elem_input_type}}* rois, + {{elem_output_type}}* output, + const int batchSize, + const int featureCount, + const xy_t imageSize, + const xy_t P2dims, + const xy_t P3dims, + const xy_t P4dims, + const xy_t P5dims, + const int samplingRatio, + const float spatial_scale, + const bool position_sensitive, + const bool continuous_coordinate, + {{prefix}}Stream_t stream) { + float mFPNScale = 224; + float normScale = sqrtf(mFPNScale * mFPNScale / (imageSize.x * imageSize.y)); + float firstThreshold = normScale * normScale / 4.f; + + const dim3 blocks(batchSize, featureCount, roiCount); + const int threads(min(256, pool_size * pool_size)); + + roiAlign_kernel<<>>( + imageSize, + featureCount, + roiCount, + firstThreshold, + samplingRatio, + (const half*)rois, + (const half*)P2, + P2dims, + (const half*)P3, + P3dims, + (const half*)P4, + P4dims, + (const half*)P5, + P5dims, + (half*)output, + {pool_size, pool_size}); +} + +} // namespace + +void {{function_name}} ( + {{elem_input_type}}* in_ptr_p2, + {{elem_input_type}}* in_ptr_p3, + {{elem_input_type}}* in_ptr_p4, + {{elem_input_type}}* in_ptr_p5, + {{elem_input_type}}* rois_ptr, + {{elem_output_type}}* out_ptr, + {{index_type}}* batch, {{index_type}}* in_ch, + {{index_type}}* p2_h, {{index_type}}* p2_w, + {{index_type}}* p3_h, {{index_type}}* p3_w, + {{index_type}}* p4_h, {{index_type}}* p4_w, + {{index_type}}* p5_h, {{index_type}}* p5_w, + const int im_h, const int im_w, + int sampling_ratio, + const float spatial_scale, + const bool position_sensitive, + const bool continuous_coordinate, + {{prefix}}Stream_t stream +) { + {{shape_function}} + + const xy_t imageSize = {im_h, im_w}; + const xy_t P2dims = {*p2_h, *p2_w}; + const xy_t P3dims = {*p3_h, *p3_w}; + const xy_t P4dims = {*p4_h, *p4_w}; + const xy_t P5dims = {*p5_h, *p5_w}; + const int featureCount = *in_ch; + const int batchSize = *batch; + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this bilinear upsampling specialization." 
+ ); +} +""" +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_output_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + int, + int, + int, + float, + bool, + bool, + {{prefix}}Stream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr_p2}}), +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr_p3}}), +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr_p4}}), +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr_p5}}), +{{indent}} static_cast<{{elem_input_type}}*>({{rois_ptr}}), +{{indent}} static_cast<{{elem_output_type}}*>({{out_ptr}}), +{{indent}} {{p_batch}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p2_h}}, {{p2_w}}, +{{indent}} {{p3_h}}, {{p3_w}}, +{{indent}} {{p4_h}}, {{p4_w}}, +{{indent}} {{p5_h}}, {{p5_w}}, +{{indent}} {{im_h}}, {{im_w}}, +{{indent}} {{sampling_ratio}}, +{{indent}} {{spatial_scale}}, +{{indent}} {{position_sensitive}}, +{{indent}} {{continuous_coordinate}}, +{{indent}} stream +{{indent}}); +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Function declaration generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + backend_spec : custom class + It specifies the corresponding backend dtypes of pytorch dtypes for many operations + + Returns + ------- + str + Rendered function declaration stmt + """ + x = func_attrs["inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + ) + + +def gen_function_call(func_attrs, backend_spec, indent=" "): + """Function call generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + indent : str, optional + Indent for template, by default " " + + Returns + ------- + str + Rendered function call + """ + p2 = func_attrs["inputs"][0] + p3 = func_attrs["inputs"][1] + p4 = func_attrs["inputs"][2] + p5 = func_attrs["inputs"][3] + rois = func_attrs["inputs"][4] + xshape = p2._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + + input_type = backend_spec.dtype_to_lib_type(p2._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr_p2=p2._attrs["name"], + in_ptr_p3=p3._attrs["name"], + in_ptr_p4=p4._attrs["name"], + in_ptr_p5=p5._attrs["name"], + rois_ptr=rois._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + im_h=func_attrs["im_shape"][0], + im_w=func_attrs["im_shape"][1], + p2_h="&" + p2._attrs["shape"][1]._attrs["name"], + p2_w="&" + p2._attrs["shape"][2]._attrs["name"], + p3_h="&" + 
p3._attrs["shape"][1]._attrs["name"], + p3_w="&" + p3._attrs["shape"][2]._attrs["name"], + p4_h="&" + p4._attrs["shape"][1]._attrs["name"], + p4_w="&" + p4._attrs["shape"][2]._attrs["name"], + p5_h="&" + p5._attrs["shape"][1]._attrs["name"], + p5_w="&" + p5._attrs["shape"][2]._attrs["name"], + sampling_ratio=func_attrs["sampling_ratio"], + spatial_scale=func_attrs["spatial_scale"], + position_sensitive="true" if func_attrs["position_sensitive"] else "false", + continuous_coordinate="true" + if func_attrs["continuous_coordinate"] + else "false", + backend_spec=backend_spec, + elem_input_type=input_type, + elem_output_type=output_type, + indent=indent, + ) diff --git a/python/aitemplate/backend/common/vision_ops/nms_common.py b/python/aitemplate/backend/common/vision_ops/nms_common.py new file mode 100644 index 000000000..50cc5e356 --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/nms_common.py @@ -0,0 +1,235 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +nms kernel codegen. +""" + +import os +from typing import Any, Dict, List + +import jinja2 + +from ... import builder +from ...target import Target +from .nms_kernel import KERNEL_TEMPLATE + +# pylint: disable=C0301 + +FUNC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { + +const int T_SIZE = {{T_SIZE}}; //(preNmsTopN + blockSize - 1) / blockSize - 1; +{{kernel}} + +} // namespace + +{{func_signature}} +{ + + const int N = *batch; + const int R = *num_rois; + nmsGpu(stream, N, R, preNmsTop, nmsMaxOut, iouThreshold, minBoxSize, fgScores, proposals, workspace, rois); +} + """ +) + +PROFILER_TEMPLATE = jinja2.Template( + """ +#include +{{header_files}} + + +size_t GLOBAL_WORKSPACE_SIZE = 0; + +namespace { + +const int T_SIZE = {{T_SIZE}}; //(preNmsTopN + blockSize - 1) / blockSize - 1; +{{kernel}} + +} // namespace + +int main(int argc, char** argv) { + int instance_num = std::stoi(argv[1]); // batch + int instance_size = std::stoi(argv[2]); // num_rois + int elem_cnt = instance_size * instance_num; + + float runtime_ms = 0; + const int64_t offsets_bytes = GetCudaAlignedSize((instance_num+1) * sizeof(int64_t)); + const int64_t scores_bytes = GetCudaAlignedSize(elem_cnt * sizeof(half)); + const int64_t boxes_bytes = GetCudaAlignedSize(elem_cnt * 4 * sizeof(half)); + int64_t temp_storage_bytes = InferTempStorageForSortPairsDescending(instance_num, instance_size); + + GLOBAL_WORKSPACE_SIZE = GetCudaAlignedSize(offsets_bytes + scores_bytes + boxes_bytes + temp_storage_bytes); + + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + """ +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(half* rois, + const half* proposals, + const half* fgScores, + int64_t* batch, + int64_t* num_rois, + const {{index_type}} preNmsTop, + const {{index_type}} nmsMaxOut, + const float iouThreshold, + const float minBoxSize, + uint8_t* workspace, + {{prefix}}Stream_t stream) + """ +) + +FUNC_DECL = 
jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{rois}}, {{proposals}}, {{fgScores}}, +{{indent}} {{p_batch}}, +{{indent}} {{num_rois}}, +{{indent}} {{preNmsTop}}, +{{indent}} {{nmsMaxOut}}, +{{indent}} {{iouThreshold}}, +{{indent}} {{minBoxSize}}, +{{indent}} global_workspace, stream /* default stream */ +{{indent}}); + """ +) + + +def gen_function(func_attrs: Dict[str, Any], header_files: str, backend_spec) -> str: + """the function for generating nms kernel""" + blockSize = 1024 + t_size = int((func_attrs["preNmsTop"] + blockSize - 1) / blockSize) + if backend_spec.backend_name == "cuda": + cuda_hmaxmin = True + else: + cuda_hmaxmin = False + + return FUNC_TEMPLATE.render( + T_SIZE=t_size, + header_files=header_files, + kernel=KERNEL_TEMPLATE.render( + prefix=backend_spec.prefix, cub=backend_spec.cub, cuda_hmaxmin=cuda_hmaxmin + ), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + prefix=backend_spec.prefix, + index_type=backend_spec.index_type, + ), + ) + + +def gen_function_decl(func_attrs: Dict[str, Any], backend_spec) -> str: + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + prefix=backend_spec.prefix, + index_type=backend_spec.index_type, + ).strip() + ) + + +def gen_function_call(func_attrs: Dict[str, Any], backend_spec, indent: str) -> str: + """ "The function for generating a function call to nms""" + output_name = "" + assert len(func_attrs["outputs"]) == 1 + assert len(func_attrs["inputs"]) == 2 + + output_name = backend_spec.cast_to_half_ptr_template.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + (input_name, score_name) = ( + backend_spec.cast_to_half_ptr_template.render(name=input_tensor._attrs["name"]) + for input_tensor in func_attrs["inputs"] + ) + + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + rois=output_name, + proposals=input_name, + fgScores=score_name, + p_batch="&" + xshape[0]._attrs["name"], + num_rois="&" + xshape[1]._attrs["name"], + preNmsTop=func_attrs["preNmsTop"], + nmsMaxOut=func_attrs["nmsMaxOut"], + iouThreshold=func_attrs["iouThreshold"], + minBoxSize=func_attrs["minBoxSize"], + indent=indent, + ) + + +def add_profiler( + file_pairs: List[Any], workdir: str, op_type, output_name: str, code: str +) -> None: + """generate code for profiling""" + prefix = os.path.join(workdir, "profiler", op_type) + if not os.path.exists(prefix): + os.makedirs(prefix) + src_path = os.path.join(prefix, output_name + ".cu") + obj_path = os.path.join(prefix, output_name) + if os.path.exists(obj_path): + return + with open(src_path, "w") as f: + f.write(code) + file_pairs.append((src_path, obj_path)) + + +def gen_profiler( + func_attrs: Dict[str, Any], workdir: str, header_files: str, backend_spec +) -> None: + """generate and build code for NMS profiling""" + op_type = func_attrs["op"] + file_pairs = [] + blockSize = 1024 + t_size = int((func_attrs["preNmsTop"] + blockSize - 1) / blockSize) + + if backend_spec.backend_name == "cuda": + cuda_hmaxmin = True + else: + cuda_hmaxmin = False + + code = PROFILER_TEMPLATE.render( + T_SIZE=t_size, + header_files=header_files, + kernel=KERNEL_TEMPLATE.render( + prefix=backend_spec.prefix, cub=backend_spec.cub, cuda_hmaxmin=cuda_hmaxmin + ), + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + prefix=backend_spec.prefix, + 
index_type=backend_spec.index_type, + ), + ) + op_name = func_attrs["op"] + add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + target = Target.current() + compile_engine = builder.Builder() + compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True)) diff --git a/python/aitemplate/backend/common/vision_ops/nms_kernel.py b/python/aitemplate/backend/common/vision_ops/nms_kernel.py new file mode 100644 index 000000000..1eb8a51bd --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/nms_kernel.py @@ -0,0 +1,565 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +nms kernel template. +""" +import jinja2 + +KERNEL_TEMPLATE = jinja2.Template( + """ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// code adapted from +// https://github.com/NVIDIA/TensorRT/blob/main/plugin/common/kernels/nmsLayer.cu +//------------------------------------------------------------------------ +// GPU kernel parameters. 
+ +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD> +__launch_bounds__(BLOCK_THREADS) __global__ void BlockSortKernel( + Key* d_in, // Tile of input + Key* d_out) // Elapsed cycle count of block scan +{ + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Specialize BlockLoad type for our thread block (uses warp-striped loads for + // coalescing, then transposes in shared memory to a blocked arrangement) + typedef {{cub}}::BlockLoad< + Key, + BLOCK_THREADS, + ITEMS_PER_THREAD, + {{cub}}::BLOCK_LOAD_WARP_TRANSPOSE> + BlockLoadT; + + // Specialize BlockRadixSort type for our thread block + typedef {{cub}}::BlockRadixSort + BlockRadixSortT; + + // Shared memory + __shared__ union TempStorage { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + } temp_storage; + + // Per-thread tile items + Key items[ITEMS_PER_THREAD]; + + // Our current block's offset + int block_offset = blockIdx.x * TILE_SIZE; + + // Load items into a blocked arrangement + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + + // Barrier for smem reuse + __syncthreads(); + + // Start cycle timer + clock_t start = clock(); + + // Sort keys + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items); + + // Stop cycle timer + clock_t stop = clock(); + + // Store output in striped fashion + {{cub}}::StoreDirectStriped( + threadIdx.x, d_out + block_offset, items); + + // // Store elapsed clocks + // if (threadIdx.x == 0) + // { + // d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start; + // } +} + +typedef enum { + STATUS_SUCCESS = 0, + STATUS_FAILURE = 1, + STATUS_BAD_PARAM = 2, + STATUS_NOT_SUPPORTED = 3, + STATUS_NOT_INITIALIZED = 4 +} pluginStatus_t; + +typedef enum { NCHW = 0, NC4HW = 1, NC32HW = 2 } DLayout_t; + +#define CSC(call, err) \ + do { \ + {{prefix}}Error_t {{prefix}}Status = call; \ + if ({{prefix}}Status != {{prefix}}Success) { \ + return err; \ + } \ + } while (0) + +template +struct Bbox { + T xmin, ymin, xmax, ymax; + Bbox(T xmin, T ymin, T xmax, T ymax) + : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {} + Bbox() = default; +}; + +// HASH +unsigned int hash(const void* array_, size_t size) { + // Apply hashing only when debugging RPN codes. + if (0) { + const char* array_const; + char* array; + {{prefix}}MallocHost((void**)&array, size); + {{prefix}}Memcpy(array, array_, size, {{prefix}}MemcpyDeviceToHost); + array_const = array; + unsigned int hash = 45599; + for (size_t i = 0; i < size; i++) { + unsigned int value = array_const[i]; + hash = hash * 1487 + value; + hash = hash * 317; + hash = hash % 105359; + } + return hash; + } else { + return 0; + } +} + +// ALIGNPTR +int8_t* alignPtr(int8_t* ptr, uintptr_t to) { + uintptr_t addr = (uintptr_t)ptr; + if (addr % to) { + addr += to - addr % to; + } + return (int8_t*)addr; +} + +#define ASSERT_PARAM(exp) \ + do { \ + if (!(exp)) \ + return STATUS_BAD_PARAM; \ + } while (0) + +// CUB's bug workaround: +// To work properly for large batch size CUB segmented sort needs ridiculous +// workspace alignment. 
+const uintptr_t ALIGNMENT = 1 << 20; + +// IOU +// template +// __device__ __host__ inline float IoU(const Bbox& a, const +// Bbox& b) +// { +// TFloat left = max(a.xmin, b.xmin), right = min(a.xmax, b.xmax); +// TFloat top = max(a.ymin, b.ymin), bottom = min(a.ymax, b.ymax); +// TFloat width = max((TFloat)(right - left + (TFloat) 1.0), (TFloat) 0.0); +// TFloat height = max((TFloat)(bottom - top + (TFloat) 1.0), (TFloat) 0.0); +// TFloat interS = width * height; +// TFloat Sa = (a.xmax - a.xmin + (TFloat) 1) * (a.ymax - a.ymin + (TFloat) +// 1); TFloat Sb = (b.xmax - b.xmin + (TFloat) 1) * (b.ymax - b.ymin + +// (TFloat) 1); return (float) interS / (float) (Sa + Sb - interS); +// } + +__device__ inline half hmax(const half a, const half b) { +{% if cuda_hmaxmin %} +#if __CUDA_ARCH__ >= 800 + return __hmax(a, b); +#else + return a > b ? a : b; +#endif +{% else %} + return a > b ? a : b; +{% endif %} +} + +__device__ inline half hmin(const half a, const half b) { +{% if cuda_hmaxmin %} +#if __CUDA_ARCH__ >= 800 + return __hmin(a, b); +#else + return a < b ? a : b; +#endif +{% else %} + return a < b ? a : b; +{% endif %} +} + +template +__device__ __host__ inline float IoU(const Bbox& a, const Bbox& b) { + T left = hmax(a.xmin, b.xmin), right = hmin(a.xmax, b.xmax); + T top = hmax(a.ymin, b.ymin), bottom = hmin(a.ymax, b.ymax); + T width = hmax(T(right - left + T(1.0)), T(0.0)); + T height = hmax(T(bottom - top + T(1.0)), T(0.0)); + float interS = __half2float(width) * __half2float(height); + float Sa = __half2float(a.xmax - a.xmin + T(1.0)) * + __half2float(a.ymax - a.ymin + T(1.0)); + float Sb = __half2float(b.xmax - b.xmin + T(1.0)) * + __half2float(b.ymax - b.ymin + T(1.0)); + + return interS / (Sa + Sb - interS); +} + +// NMS KERNEL FOR SMALL BATCH SIZE +template +__global__ __launch_bounds__(DIM) void nmsKernel1( + const int propSize, + Bbox const* __restrict__ preNmsProposals, + T_ROIS* __restrict__ afterNmsProposals, + const int preNmsTopN, + const float nmsThres, + const int afterNmsTopN) { + __shared__ bool kept_boxes[TSIZE * DIM]; + int kept = 0; + int batch_offset = blockIdx.x * propSize; + int max_box_idx = batch_offset + preNmsTopN; + int batch_offset_out = blockIdx.x * afterNmsTopN; + + int flag_idx[TSIZE]; + int boxes_idx[TSIZE]; + Bbox cur_boxes[TSIZE]; + +// initialize kept_boxes +#pragma unroll + for (int i = 0; i < TSIZE; i++) { + boxes_idx[i] = threadIdx.x + batch_offset + DIM * i; + flag_idx[i] = threadIdx.x + DIM * i; + + if (boxes_idx[i] < max_box_idx) { + cur_boxes[i] = preNmsProposals[boxes_idx[i]]; + kept_boxes[flag_idx[i]] = true; + } else { + kept_boxes[flag_idx[i]] = false; + boxes_idx[i] = -1.0f; + flag_idx[i] = -1.0f; + } + } + + int ref_box_idx = 0 + batch_offset; + + // remove the overlapped boxes + while ((kept < afterNmsTopN) && (ref_box_idx < max_box_idx)) { + Bbox ref_box; + ref_box = preNmsProposals[ref_box_idx]; + +#pragma unroll + for (int i = 0; i < TSIZE; i++) { + if (boxes_idx[i] > ref_box_idx) { + if (IoU(ref_box, cur_boxes[i]) > nmsThres) { + kept_boxes[flag_idx[i]] = false; + } + } else if (boxes_idx[i] == ref_box_idx) { + afterNmsProposals[(batch_offset_out + kept) * 4 + 0] = ref_box.xmin; + afterNmsProposals[(batch_offset_out + kept) * 4 + 1] = ref_box.ymin; + afterNmsProposals[(batch_offset_out + kept) * 4 + 2] = ref_box.xmax; + afterNmsProposals[(batch_offset_out + kept) * 4 + 3] = ref_box.ymax; + } + } + __syncthreads(); + + do { + ref_box_idx++; + } while (!kept_boxes[ref_box_idx - batch_offset] && + ref_box_idx < max_box_idx); + + 
kept++; + } +} + +// NMS KERNEL FOR LARGE BATCH SIZE +template +__global__ __launch_bounds__(DIM) void nmsKernel2( + const int propSize, + Bbox const* __restrict__ proposals, + T_ROIS* __restrict__ filtered, + const int preNmsTopN, + const float nmsThres, + const int afterNmsTopN) { + Bbox const* cProposals = proposals + blockIdx.x * propSize; + + Bbox t[TSIZE]; + uint64_t del = 0; + + for (int i = 0; i < TSIZE; i++) { + if (i < TSIZE - 1 || i * DIM + threadIdx.x < preNmsTopN) { + t[i] = cProposals[i * DIM + threadIdx.x]; + } + } + + __shared__ Bbox last; + __shared__ bool kept; + __shared__ int foundBatch; + if (threadIdx.x == 0) + foundBatch = 0; + + for (int i = 0; i < TSIZE; i++) { + for (int j = 0; j < DIM; j++) { + int offset = i * DIM; + int index = offset + j; + if (index >= preNmsTopN) + break; + + __syncthreads(); + + if (threadIdx.x == j) { + kept = 0 == (del & ((uint64_t)1 << i)); + last = t[i]; + + if (kept) { + int cnt = blockIdx.x * afterNmsTopN + foundBatch; + filtered[cnt * 4 + 0] = t[i].xmin; + filtered[cnt * 4 + 1] = t[i].ymin; + filtered[cnt * 4 + 2] = t[i].xmax; + filtered[cnt * 4 + 3] = t[i].ymax; + foundBatch++; + } + } + + __syncthreads(); + + if (foundBatch == afterNmsTopN) { + return; + } + + if (kept) { + Bbox test = last; + + for (int k = 0; k < TSIZE; k++) { + if (index < k * DIM + threadIdx.x && + IoU(test, t[k]) > nmsThres) { + del |= (uint64_t)1 << k; + } + } + } + } + } +} + +// NMS LAUNCH +template +pluginStatus_t nmsLaunch( + {{prefix}}Stream_t stream, + const int batch, + const int propSize, + void* proposals, + void* filtered, + const int preNmsTopN, + const float nmsThres, + const int afterNmsTopN) { + const int blockSize = 1024; + + // #define P1(tsize) nmsKernel1 + // #define P2(tsize) nmsKernel2 + + // void (*kernel[64])( + // int, Bbox const*, T_ROIS*, int, float, int) = { + // P1(1), P1(2), P1(3), P1(4), P1(5), P1(6), P1(7), P1(8), + // P1(9), P1(10), P1(11), P1(12), P2(13), P2(14), P2(15), P2(16), + // P2(17), P2(18), P2(19), P2(20), P2(21), P2(22), P2(23), P2(24), + // P2(25), P2(26), P2(27), P2(28), P2(29), P2(30), P2(31), P2(32), + // P2(33), P2(34), P2(35), P2(36), P2(37), P2(38), P2(39), P2(40), + // P2(41), P2(42), P2(43), P2(44), P2(45), P2(46), P2(47), P2(48), + // P2(49), P2(50), P2(51), P2(52), P2(53), P2(54), P2(55), P2(56), + // P2(57), P2(58), P2(59), P2(60), P2(61), P2(62), P2(63), P2(64)}; + +#if T_SZIE <= 12 +#define nmsKernel nmsKernel1 +#else +#define nmsKernel nmsKernel2 +#endif + + ASSERT_PARAM(preNmsTopN < 64 * blockSize); + + CSC({{prefix}}MemsetAsync( + filtered, 0x00, batch * afterNmsTopN * 4 * sizeof(T_ROIS), stream), + STATUS_FAILURE); + + nmsKernel<<>>( + propSize, + (Bbox*)proposals, + (T_ROIS*)filtered, + preNmsTopN, + nmsThres, + afterNmsTopN); + + CSC({{prefix}}GetLastError(), STATUS_FAILURE); + + return STATUS_SUCCESS; +} + +// SET OFFSET +// Works for up to 2Gi elements (cub's limitation)! +__global__ void setOffset(int stride, int size, int* output) { + // One block, because batch size shouldn't be too large. 
+ for (int i = threadIdx.x; i < size; i += blockDim.x) { + output[i] = i * stride; + } +} + +// BBFilter KERNEL +__global__ void bboxFilter_kernel( + int N, + const float minSize, + const half* proposals, + half* scores) { + if (minSize == 0) + return; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + uint16_t bits = 0x3c00u; + half one = reinterpret_cast(bits); + + if (tid < N) { + int ininf = 0xff800000; + float ninf = *(float*)&ininf; + + if (__hsub(proposals[tid * 4 + 2], proposals[tid * 4 + 0]) < + half(minSize) || + __hsub(proposals[tid * 4 + 3], proposals[tid * 4 + 1]) < + half(minSize)) { + scores[tid] = half(ninf); + } + } +} + +inline size_t GetCudaAlignedSize(size_t size) { + const size_t kCudaAlignSize = 1 << 20; + return (size + kCudaAlignSize - 1) / kCudaAlignSize * kCudaAlignSize; +} + +class MultiplyFunctor final { + public: + MultiplyFunctor(int32_t num_col) : num_col_(num_col) {} + __host__ __device__ __forceinline__ int32_t operator()(int32_t idx) const { + return idx * num_col_; + } + + private: + int32_t num_col_; +}; + +template +size_t InferTempStorageForSortPairsDescending( + int32_t num_row, + int32_t num_col) { + using SegmentOffsetIter = {{cub}}::TransformInputIterator< + int32_t, + MultiplyFunctor, + {{cub}}::CountingInputIterator>; + + {{cub}}::CountingInputIterator counting_iter(0); + MultiplyFunctor multiply_functor(num_col); + SegmentOffsetIter segment_offset_iter(counting_iter, multiply_functor); + + size_t temp_storage_bytes = 0; + auto err = {{cub}}::DeviceSegmentedRadixSort:: + SortPairsDescending( + /* d_temp_storage */ nullptr, + /* temp_storage_bytes */ temp_storage_bytes, + /* d_keys_in */ nullptr, + /* d_keys_out */ nullptr, + /* d_values_in */ nullptr, + /* d_values_out */ nullptr, + /* num_items */ num_row * num_col, + /* num_segments */ num_row, + /* d_begin_offsets */ segment_offset_iter, + /* d_end_offsets */ segment_offset_iter + 1, + /* begin_bit */ 0, + /* end_bit */ sizeof(KeyType) * 8, + /* stream */ 0); + // OF_CUDA_CHECK(err); + + return temp_storage_bytes; +} + +// NMS GPU +template +pluginStatus_t nmsGpu( + {{prefix}}Stream_t stream, + const int N, + const int R, + const int preNmsTop, + const int nmsMaxOut, + const float iouThreshold, + const float minBoxSize, + const void* fgScores, + const void* proposals, + void* workspace, + void* rois) { + const int BS = 32; + const int GS = ((R) + BS - 1) / BS; + bboxFilter_kernel<<>>( + R, minBoxSize, (T_ROIS*)proposals, (T_ROIS*)fgScores); + + int8_t* vworkspace = alignPtr((int8_t*)workspace, 32); + + pluginStatus_t error; + + int* offsets = (int*)vworkspace; + setOffset<<<1, 1024, 0, stream>>>(R, N + 1, offsets); + CSC({{prefix}}GetLastError(), STATUS_FAILURE); + + vworkspace = vworkspace + N + 1; + vworkspace = alignPtr(vworkspace, ALIGNMENT); + + std::size_t tempStorageBytes = + InferTempStorageForSortPairsDescending(N, R); + + CSC({{prefix}}GetLastError(), STATUS_FAILURE); + + T_SCORES* scoresOut = (T_SCORES*)vworkspace; + vworkspace = (int8_t*)(scoresOut + N * R); + vworkspace = alignPtr(vworkspace, ALIGNMENT); + Bbox* proposalsOut = (Bbox*)vworkspace; + vworkspace = (int8_t*)(proposalsOut + N * R); + vworkspace = alignPtr(vworkspace, ALIGNMENT); + + {{cub}}::DeviceSegmentedRadixSort::SortPairsDescending( + vworkspace, + tempStorageBytes, + (T_SCORES*)fgScores, + (T_SCORES*)scoresOut, + (Bbox*)proposals, + (Bbox*)proposalsOut, + N * R, + N, + offsets, + offsets + 1, + 0, + 8 * sizeof(T_SCORES), + stream); + + CSC({{prefix}}GetLastError(), STATUS_FAILURE); + + error = nmsLaunch( + 
stream, N, R, proposalsOut, rois, preNmsTop, iouThreshold, nmsMaxOut); + + if (error != STATUS_SUCCESS) { + return error; + } + return STATUS_SUCCESS; +} + """ +) diff --git a/python/aitemplate/backend/common/vision_ops/roi_align_common.py b/python/aitemplate/backend/common/vision_ops/roi_align_common.py new file mode 100644 index 000000000..b658b711f --- /dev/null +++ b/python/aitemplate/backend/common/vision_ops/roi_align_common.py @@ -0,0 +1,392 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +roi align common functions for all backends. +""" + +import jinja2 + +# pylint: disable=C0103,C0415,W0613,C0301,W0612 + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}roi_align_launcher( +{{indent}} in_ptr, +{{indent}} rois_ptr, +{{indent}} out_ptr, +{{indent}} NI, +{{indent}} HI, +{{indent}} WI, +{{indent}} CI, +{{indent}} HO, +{{indent}} WO, +{{indent}} sampling_ratio, +{{indent}} spatial_scale, +{{indent}} position_sensitive, +{{indent}} continuous_coordinate, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +{{header_files}} + +namespace { +#define CUDA_KERNEL_LOOP(i, n) \ + for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +template +__device__ float2 bilinear_interpolate(const half2* bottom_data, + const int height, + const int width, + T y, + T x, + const int channels, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + float2 val = {0.f, 0.f}; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return val; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + + y_high = y_low >= height - 1 ? height - 1 : y_low + 1; + y_low = y_low >= height - 1 ? height - 1 : y_low; + y = y_low >= height - 1 ? (T)y_low : y; + + x_high = x_low >= width - 1 ? width - 1 : x_low + 1; + x_low = x_low >= width - 1 ? width - 1 : x_low; + x = x_low >= width - 1 ? (T)x_low : x; + + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + const half2 v1 = __ldg(bottom_data + (y_low * width + x_low) * channels); + const half2 v2 = __ldg(bottom_data + (y_low * width + x_high) * channels); + const half2 v3 = __ldg(bottom_data + (y_high * width + x_low) * channels); + const half2 v4 = __ldg(bottom_data + (y_high * width + x_high) * channels); + + T v1_x = __half2float(v1{{half2_data_ref}}.x); + T v2_x = __half2float(v2{{half2_data_ref}}.x); + T v3_x = __half2float(v3{{half2_data_ref}}.x); + T v4_x = __half2float(v4{{half2_data_ref}}.x); + + T v1_y = __half2float(v1{{half2_data_ref}}.y); + T v2_y = __half2float(v2{{half2_data_ref}}.y); + T v3_y = __half2float(v3{{half2_data_ref}}.y); + T v4_y = __half2float(v4{{half2_data_ref}}.y); + + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + val.x = (w1 * v1_x + w2 * v2_x + w3 * v3_x + w4 * v4_x); + val.y = (w1 * v1_y + w2 * v2_y + w3 * v3_y + w4 * v4_y); + + return val; +} + +template +__global__ void roi_align_f16_nhwc_kernel(const half2* bottom_data, + const half* bottom_rois, + half2* top_data, + const int64_t N, + const int64_t height, + const int64_t width, + const int64_t channels, + const int64_t pooled_height, + const int64_t pooled_width, + const int sampling_ratio, + const float spatial_scale, + const bool position_sensitive, + const bool continuous_coordinate) { + + const int64_t nthreads = num_rois * channels * pooled_width * pooled_height; + + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + // index = c + channels * (x + out_width * (y + out_height * b)) + int64_t idx = index; + const int c = idx % channels; + idx /= channels; + const int pw = idx % pooled_width; + idx /= pooled_width; + const int ph = idx % pooled_height; + const int n = idx / pooled_height; + + + const half* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = static_cast(__half2float(offset_bottom_rois[0])); + + float2 output_val = {0.f, 0.f}; + if (roi_batch_ind < 0) { + top_data[index] = __float22half2_rn(output_val); + continue; + } + + // Do not using rounding; this implementation detail is critical + T roi_offset = continuous_coordinate ? static_cast(0.5) : static_cast(0); + T roi_start_w = __half2float(offset_bottom_rois[1]) * spatial_scale - roi_offset; + T roi_start_h = __half2float(offset_bottom_rois[2]) * spatial_scale - roi_offset; + T roi_end_w = __half2float(offset_bottom_rois[3]) * spatial_scale - roi_offset; + T roi_end_h = __half2float(offset_bottom_rois[4]) * spatial_scale - roi_offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!continuous_coordinate) { // backward compatiblity + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int c_unpooled = c; + int channels_unpooled = channels; + if (position_sensitive) { + c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw; + channels_unpooled = channels * pooled_height * pooled_width; + } + + const half2* offset_bottom_data = + bottom_data + (roi_batch_ind * height * width * channels_unpooled + c_unpooled); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + // T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const T y = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + float2 val = bilinear_interpolate(offset_bottom_data, height, width, y, x, channels, index); + output_val.x += val.x; + output_val.y += val.y; + } + } + output_val.x /= count; + output_val.y /= count; + + top_data[index] = __float22half2_rn(output_val); + } + +} + + +template +constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + + +template +void roi_align_launcher({{elem_input_type}}* input, + {{elem_input_type}}* rois, + {{elem_output_type}}* output, + const {{index_type}} N, + const {{index_type}} H, + const {{index_type}} W, + const {{index_type}} C, + const {{index_type}} HO, + const {{index_type}} WO, + const int sampling_ratio, + const float spatial_scale, + const bool position_sensitive, + const bool continuous_coordinate, + {{prefix}}Stream_t stream) { + + const int64_t output_size = num_rois * C * HO * WO; + + dim3 grid(std::min( + ceil_div(static_cast(output_size), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + roi_align_f16_nhwc_kernel<<>>( + (const half2*)input, (const half*)rois, (half2*)output, N, H, W, C / 2, HO, WO, + sampling_ratio, spatial_scale, position_sensitive, continuous_coordinate); + +} +} // namespace + +void {{function_name}} ( + {{elem_input_type}}* in_ptr, + {{elem_input_type}}* rois_ptr, + {{elem_output_type}}* out_ptr, + {{index_type}}* batch, + {{index_type}}* in_h, + {{index_type}}* in_w, + {{index_type}}* in_ch, + {{index_type}}* out_batch, + {{index_type}}* out_h, + {{index_type}}* out_w, + int sampling_ratio, + const float spatial_scale, + const bool position_sensitive, + const bool continuous_coordinate, + {{prefix}}Stream_t stream +) { + {{shape_function}} + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this avg pool2d specialization." 
+ ); +} + +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_output_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + {{index_type}}*, + int, + float, + bool, + bool, + {{prefix}}Stream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} static_cast<{{elem_input_type}}*>({{in_ptr}}), +{{indent}} static_cast<{{elem_input_type}}*>({{rois_ptr}}), +{{indent}} static_cast<{{elem_output_type}}*>({{out_ptr}}), +{{indent}} {{p_batch}}, +{{indent}} {{p_in_h}}, +{{indent}} {{p_in_w}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p_out_batch}}, +{{indent}} {{p_out_h}}, +{{indent}} {{p_out_w}}, +{{indent}} {{sampling_ratio}}, +{{indent}} {{spatial_scale}}, +{{indent}} {{position_sensitive}}, +{{indent}} {{continuous_coordinate}}, +{{indent}} stream +{{indent}}); +""" +) + + +def gen_function_decl(func_attrs, backend_spec): + """Function declaration generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + backend_spec : custom class + It specifies the corresponding backend dtypes of pytorch dtypes for many operations + + Returns + ------- + str + Rendered function declaration stmt + """ + x = func_attrs["inputs"][0] + y = func_attrs["outputs"][0] + input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + return FUNC_DECL_TEMPLATE.render( + index_type=backend_spec.index_type, + prefix=backend_spec.prefix, + func_name=func_attrs["name"], + elem_input_type=input_type, + elem_output_type=output_type, + ) + + +def gen_function_call(func_attrs, backend_spec, indent=" "): + """Function call generation + + Parameters + ---------- + func_attrs : Dict[str, Any] + It describes the operation attributes + indent : str, optional + Indent for template, by default " " + + Returns + ------- + str + Rendered function call + """ + x = func_attrs["inputs"][0] + rois = func_attrs["inputs"][1] + xshape = x._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + + input_type = backend_spec.dtype_to_lib_type(x._attrs["dtype"]) + output_type = backend_spec.dtype_to_lib_type(y._attrs["dtype"]) + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + rois_ptr=rois._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + sampling_ratio=func_attrs["sampling_ratio"], + spatial_scale=func_attrs["spatial_scale"], + position_sensitive="true" if func_attrs["position_sensitive"] else "false", + continuous_coordinate="true" + if func_attrs["continuous_coordinate"] + else "false", + backend_spec=backend_spec, + elem_input_type=input_type, + elem_output_type=output_type, + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/__init__.py b/python/aitemplate/backend/cuda/__init__.py new file mode 100644 index 000000000..38586aab5 --- /dev/null +++ b/python/aitemplate/backend/cuda/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +""" +CUDA backend codegen functions. +""" +from . import cuda_common, lib_template, target_def, utils +from .common import * +from .conv2d import * +from .elementwise import * +from .embedding import * +from .gemm_special import * +from .gemm_universal import * +from .gemm_epilogue_vistor import * +from .layernorm_sigmoid_mul import * +from .padding import * +from .pool2d import * +from .reduce import * +from .softmax import * +from .tensor import * +from .upsample import * +from .view_ops import * +from .vision_ops import * +from .attention import * +from .groupnorm import * diff --git a/python/aitemplate/backend/cuda/attention/__init__.py b/python/aitemplate/backend/cuda/attention/__init__.py new file mode 100644 index 000000000..61a47c3ad --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +cuda flash_attention module init +""" +from . import flash_attention + +__all__ = ["flash_attention"] diff --git a/python/aitemplate/backend/cuda/attention/flash_attention.py b/python/aitemplate/backend/cuda/attention/flash_attention.py new file mode 100644 index 000000000..b2fe5c0ca --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/flash_attention.py @@ -0,0 +1,319 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +attention kernel codegen for CUDA. +""" +from typing import Any, Dict + +import jinja2 + +from ... 
import registry + +# pylint: disable=C0301 + +FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template( + "reinterpret_cast(&({{name}}->raw()))" +) + +FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_CALL_FP32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + +FUNC_TEMPLATE = jinja2.Template( + """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" + +#include "fmha.h" +#include "fmha_fprop_kernel_1xN.h" + +namespace { + +template +__global__ void fmha_fprop_fp16_sm80_loop_kernel(Fused_multihead_attention_fprop_params params) { + fmha::device_1xN_loop(params); +} + +template +void run_fmha_fp16_sm80_loop_(Launch_params &launch_params, + const bool configure) { + bool is_causal = launch_params.params.is_causal; + auto kernel = (is_causal + ? (&fmha_fprop_fp16_sm80_loop_kernel) + : (&fmha_fprop_fp16_sm80_loop_kernel)); + + constexpr int N = Kernel_traits::Cta_tile_p::N; + const int loop_steps = (launch_params.params.s + N - 1) / N; + constexpr int smem_size_softmax_lse = Kernel_traits::Smem_dp_sum::BYTES_PER_TILE; + // Don't need smem_size_softmax_lse if we're not looping + const int smem_size = fmha::get_dynamic_smem_size() + + (loop_steps > 1 ? smem_size_softmax_lse : 0); + + if( smem_size >= 48 * 1024 ) { + FMHA_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + if (configure) { + using Mma_tile_p = fmha::Hmma_tile; + constexpr int M = Kernel_traits::Cta_tile_p::M; + size_t STEPS = (launch_params.params.s + M - 1) / M; + constexpr size_t MMAS_M = Mma_tile_p::MMAS_M; + constexpr size_t MMAS_N = Mma_tile_p::MMAS_N; + size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps; + launch_params.elts_per_thread = elts_per_head; + return; + } + + dim3 grid(launch_params.params.h, launch_params.params.b); + kernel<<>>( + launch_params.params); + + FMHA_CHECK_CUDA(cudaPeekAtLastError()); +} + +void run_fmha_fp16_sm80(Launch_params &launch_params, + const bool configure) { +{{custom_kernel}} +} + +void set_params(Fused_multihead_attention_fprop_params ¶ms, + // sizes + const size_t b, + const size_t s, + const size_t h, + const size_t d, + // device pointers + void *qkv_packed_d, + void *cu_seqlens_d, + void *o_packed_d, + void *o_tmp_d, + void *do_packed_d, + void *s_d, + void *softmax_lse_d, + void *dsoftmax_sum_d, + float p_dropout, + float softmax_scale, + bool is_causal) { + + Data_type acc_type = DATA_TYPE_FP32; + Data_type data_type = DATA_TYPE_FP16; + + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + + // Set the pointers and strides. + params.q_ptr = qkv_packed_d; + params.k_ptr = qkv_packed_d + get_size_in_bytes(h * d, data_type); + params.v_ptr = qkv_packed_d + 2 * get_size_in_bytes(h * d, data_type); + params.q_row_stride_in_elts = 3 * h * d; + params.k_row_stride_in_elts = 3 * h * d; + params.v_row_stride_in_elts = 3 * h * d; + params.q_head_stride_in_elts = d; + params.k_head_stride_in_elts = d; + params.v_head_stride_in_elts = d; + params.o_ptr = o_packed_d; + params.o_row_stride_in_elts = h * d; + params.o_head_stride_in_elts = d; + params.do_ptr = do_packed_d; + params.o_tmp_ptr = o_tmp_d; + + params.cu_seqlens = static_cast(cu_seqlens_d); + + // S = softmax(P) + params.s_ptr = s_d; + params.s_stride_in_bytes = get_size_in_bytes(b * h * s, data_type); + + // Softmax sum + params.softmax_lse_ptr = softmax_lse_d; + params.dsoftmax_sum = dsoftmax_sum_d; + + // Set the dimensions. 
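+  // Here b = batch size, s = (maximum) sequence length, h = number of heads
+  // and d = per-head size, matching the (batch_size, seq_len, num_heads,
+  // head_size) arguments passed by the generated caller below.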
+ params.b = b; + params.h = h; + params.s = s; + params.d = d; + + // Set the different scale values. + // const float scale_bmm1 = 1.f / sqrtf(d); + const float scale_bmm1 = softmax_scale; + constexpr float scale_softmax = 1.f; + constexpr float scale_bmm2 = 1.f; + + params.scale_bmm1f = scale_bmm1; + set_alpha(params.scale_bmm1, scale_bmm1, data_type); + set_alpha(params.scale_softmax, scale_softmax, acc_type); + set_alpha(params.scale_bmm2, scale_bmm2, data_type); + + // Set this to probability of keeping an element to simplify things. + params.p_dropout = 1.f - p_dropout; + // Convert p from float to int so we don't have to convert the random uint to float to compare. + // [Minor] We want to round down since when we do the comparison we use <= instead of < + params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0)); + params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0)); + params.rp_dropout = 1.f / params.p_dropout; + set_alpha(params.scale_dropout, params.rp_dropout, data_type); + + params.is_causal = is_causal; +} +} // namespace + +{{func_signature}} +{ + bool is_dropout = p_dropout > 0.0; + bool return_softmax = false; + + Launch_params launch_params(stream, is_dropout, return_softmax); + + set_params(launch_params.params, + batch_size, // b + seq_len, // s + num_heads, // h + head_size, // d + (void*)qkv, + (void*)cu_seqlens, + (void*)output, + loop ? (void*)o_tmp : nullptr, + nullptr, + nullptr, // return softmax + (void*)softmax_lse, + nullptr, + p_dropout, + softmax_scale, + is_causal); + + run_fmha_fp16_sm80(launch_params, /*configure=*/ false); +} + """ +) + + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(half* output, + const half* qkv, + const int* cu_seqlens, + float* softmax_lse, + float* o_tmp, + int batch_size, + int seq_len, + int num_heads, + int head_size, + float p_dropout, + float softmax_scale, + bool is_causal, + bool loop, + cudaStream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{output}}, {{qkv}}, {{cu_seqlens}}, +{{indent}} {{softmax_lse}}, {{o_tmp}}, +{{indent}} {{batch_size}}, +{{indent}} {{seq_len}}, +{{indent}} {{num_heads}}, +{{indent}} {{head_size}}, +{{indent}} {{p_dropout}}, +{{indent}} {{softmax_scale}}, +{{indent}} {{is_causal}}, {{loop}}, stream /* default stream */ +{{indent}}); + """ +) + +ATT_KERNEL_TEMPLATE = jinja2.Template( + """ + using Kernel_traits = FMHA_kernel_traits<{{s1}}, {{s2}}, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + """ +) + + +@registry.reg("cuda.flash_attention.gen_function") +def flash_attention_gen_function(func_attrs: Dict[str, Any]) -> str: + """the function for generating attention kernel""" + return FUNC_TEMPLATE.render( + custom_kernel=ATT_KERNEL_TEMPLATE.render( + s1=128 if func_attrs["seq_len"] == 128 else 256, + s2=func_attrs["head_size"], + ), + func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]), + ) + + +@registry.reg("cuda.flash_attention.func_decl") +def flash_attention_gen_function_decl(func_attrs: Dict[str, Any]): + return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip() + ) + + +@registry.reg("cuda.flash_attention.func_call") +def flash_attention_gen_function_call(func_attrs, indent=" "): + """the function for generating a function call for attention""" + output_name = "" + assert len(func_attrs["outputs"]) == 1 + assert 
len(func_attrs["inputs"]) == 2 + + output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + + qkv_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][0]._attrs["name"] + ) + + seqlens_name = FUNC_CALL_INT32_PARAM_TEMPLATE.render( + name=func_attrs["inputs"][1]._attrs["name"] + ) + + x = func_attrs["inputs"][0] + + batch_size = func_attrs["batch_size"] + seq_len = func_attrs["seq_len"] + + num_heads = x._attrs["shape"][2]._attrs["values"][0] + head_size = x._attrs["shape"][3]._attrs["values"][0] + p_dropout = func_attrs["dropout"] + is_causal = func_attrs["causal"] + softmax_scale = head_size ** (-0.5) + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + output=output_name, + qkv=qkv_name, + cu_seqlens=seqlens_name, + softmax_lse="reinterpret_cast(global_workspace)", + o_tmp="reinterpret_cast(global_workspace + {} * sizeof(float))".format( + batch_size * num_heads * seq_len + ), + batch_size=batch_size, + seq_len=seq_len, + num_heads=num_heads, + head_size=head_size, + p_dropout=p_dropout, + softmax_scale=softmax_scale, + is_causal="true" if is_causal else "false", + loop="true" if seq_len > 256 else "false", + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/attention/src/fmha.h b/python/aitemplate/backend/cuda/attention/src/fmha.h new file mode 100644 index 000000000..9cc516722 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha.h @@ -0,0 +1,211 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include + +#include + +constexpr int TOTAL_DIM = 0; +constexpr int THREE_DIM = 1; +constexpr int H_DIM = 2; +constexpr int D_DIM = 3; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct PhiloxCudaState { + PhiloxCudaState() = default; + // Called if graph capture is not underway + PhiloxCudaState(uint64_t seed, uint64_t offset) { + seed_ = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxCudaState( + uint64_t seed, + int64_t* offset_extragraph, + uint32_t offset_intragraph) { + seed_ = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::cuda::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. + union Payload { + uint64_t val; + int64_t* ptr; + }; + + uint64_t seed_ = 0; + Payload offset_; + uint32_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +struct Qkv_params { + // The QKV matrices. + void* __restrict__ q_ptr; + void* __restrict__ k_ptr; + void* __restrict__ v_ptr; + + // The stride between rows of the Q, K and V matrices. + // size_t qkv_stride_in_elts; + // size_t qkv_stride_in_bytes; + // TD [2022-04-16]: We're using 32-bit indexing to save registers. + // The code probably won't work for arrays larger than 2GB. + uint32_t q_row_stride_in_elts; + uint32_t k_row_stride_in_elts; + uint32_t v_row_stride_in_elts; + uint32_t q_head_stride_in_elts; + uint32_t k_head_stride_in_elts; + uint32_t v_head_stride_in_elts; + + // The number of heads. + int h; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Fused_multihead_attention_fprop_params : public Qkv_params { + // The dQKV matrices. + void* __restrict__ dqkv_ptr; + + // The O matrix (output). + void* __restrict__ o_ptr; + + // The stride between rows of O. + // size_t o_stride_in_elts; + // size_t o_stride_in_bytes; + uint32_t o_row_stride_in_elts; + uint32_t o_head_stride_in_elts; + + // The pointer to the O_tmp matrix, which holds O intermediate value during + // the loop; + void* __restrict__ o_tmp_ptr; + + // The dO matrix . + void* __restrict__ do_ptr; + + // The pointer to the S matrix, overwritten by the dP matrix (bwd). + void* __restrict__ s_ptr; + // The stride between rows of the S matrix. + // int64_t s_stride_in_bytes; + uint32_t s_stride_in_bytes; + + // The pointer to the softmax sum. + void* __restrict__ softmax_lse_ptr; + + // The pointer to the softmax d sum. + void* __restrict__ dsoftmax_sum; + + // The dimensions. + int b, s, d; + + // The scaling factors for the kernel. + float scale_bmm1f; + uint32_t scale_bmm1, scale_softmax, scale_bmm2; + + // array of length b+1 holding starting offset of each sequence. 
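+  // e.g. for a batch of three sequences with lengths s0, s1, s2:
+  //   cu_seqlens = {0, s0, s0 + s1, s0 + s1 + s2}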
+ int* __restrict__ cu_seqlens; + + int* __restrict__ blockmask; + + // The dropout probability (probability of keeping an activation). + float p_dropout; + uint32_t p_dropout_in_uint; + uint16_t p_dropout_in_uint16_t; + + // Scale factor of 1 / (1 - p_dropout). + float rp_dropout; + + // Scale factor of 1 / (1 - p_dropout), in half2. + uint32_t scale_dropout; + + // Random state. + PhiloxCudaState philox_args; + + bool is_causal; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Launch_params { + Launch_params(cudaStream_t stream_, bool is_dropout_, bool return_softmax_) + : elts_per_thread(0), + stream(stream_), + is_dropout(is_dropout_), + return_softmax(return_softmax_) {} + + size_t elts_per_thread; + + cudaStream_t stream; + + bool is_dropout; + bool return_softmax; + + Kernel_params params; + int num_full_heads; + int num_main_groups; + int heads_last_wave; + int main_steps; + int rest_steps; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// void run_fmha_fp16_sm80(Launch_params +// &launch_params, const bool configure); + +// void run_fmha_dgrad_fp16_sm80(const Fused_multihead_attention_fprop_params +// ¶ms, cudaStream_t stream); + +// void +// run_fmha_block_fp16_sm80(Launch_params +// &launch_params, const bool configure); + +// void run_fmha_block_dgrad_fp16_sm80(const +// Fused_multihead_attention_fprop_params ¶ms, cudaStream_t stream); diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h new file mode 100644 index 000000000..433676370 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/gemm.h @@ -0,0 +1,482 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/layout/layout.h" + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_base_ { + // The data type. + using Data_type = Data_type_; + // default input type + using Input_type_ = Data_type_; + // Does it store the array of elements. + static constexpr bool HAS_ELTS = BITS_PER_ELT_ >= 8; + // The number of elements. + static constexpr int NUM_ELTS = NUM_ELTS_; + // The size of element in bits. + static constexpr int BITS_PER_ELT = BITS_PER_ELT_; + // The size of byte of a single register. + static constexpr int BYTES_PER_REG = 4; + // The size in bits. + static constexpr int BITS_PER_REG = BYTES_PER_REG * 8; + // The number of registers needed to store the fragment. + static constexpr int NUM_REGS = + DivUpConstexpr(NUM_ELTS * BITS_PER_ELT, BITS_PER_REG); + // The size in bytes (as returned by sizeof(Fragment_base<>). + static constexpr int SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG; + // The alignment. + static constexpr int ALIGNMENT = ALIGNMENT_ > 0 + ? ALIGNMENT_ + : MinConstexpr(NUM_REGS* BYTES_PER_REG, 16); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The type of the elements. + typename Data_type_, + // The number of elements. + int NUM_ELTS_, + // The alignment if you want to force a value -- use 0 otherwise. + int ALIGNMENT_ = 0, + // The base class. + typename Base_ = Fragment_base_< + Data_type_, + NUM_ELTS_, + 8 * sizeof(Data_type_), + ALIGNMENT_>> +struct alignas(static_cast(Base_::ALIGNMENT)) Fragment : public Base_ { + // The size of a load/store. + static constexpr int BYTES_PER_LOAD_STORE = + Base_::NUM_REGS * sizeof(uint32_t); + + // Clear the fragment. Using PTX in that code seems to produce better SASS... + inline __device__ void clear() { +#pragma unroll + for (int ii = 0; ii < Base_::NUM_REGS; ++ii) { + asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) :); + } + } + + // Immutable access to a register. + inline __device__ const uint32_t& reg(int ii) const { + return this->regs_[ii]; + } + + // Mutable access to a register. + inline __device__ uint32_t& reg(int ii) { + return this->regs_[ii]; + } + + uint32_t regs_[Base_::NUM_REGS]; + + // Immutable access to the elements. + inline __device__ const Data_type_& elt(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to the elements. + inline __device__ Data_type_& elt(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Immutable access to the elements with a cast. + template + inline __device__ const Cast_type& elt_as(int ii) const { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Mutable access to the elements. 
+ template + inline __device__ Cast_type& elt_as(int ii) { + return reinterpret_cast(&this->regs_[0])[ii]; + } + + // Add another fragment. + inline __device__ void add(const Fragment& other) { +// TODO (TD 2022-04-09): Shouldn't this be NUM_REGS instead of NUM_ELTS? +// Also are we doing int addition or __half2 addition? +#pragma unroll + for (int ii = 0; ii < NUM_ELTS_; ++ii) { + this->elt(ii) += other.elt(ii); + } + } + + // Multiply by another fragment. + inline __device__ void hmul(const Fragment& other) { +#pragma unroll + for (int ii = 0; ii < Base_::NUM_REGS; ++ii) { + this->reg(ii) = fmha::hmul2(this->reg(ii), other.reg(ii)); + } + } + + inline __device__ void hrelu_() { +#pragma unroll + for (int ii = 0; ii < Base_::NUM_REGS; ++ii) { + this->reg(ii) = fmha::hrelu2(this->reg(ii)); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_a : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Fragment_b : public Fragment {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Fragment_accumulator : public Fragment { + // The base class. + using Base = Fragment; + + // Add two fragments. + template + inline __device__ void add(const Other_fragment_& other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) = this->elt(ii) + other.elt(ii); + } + } + + inline __device__ void mul_(const float other) { + for (int ii = 0; ii < Base::NUM_ELTS; ++ii) { + this->elt(ii) *= other; + } + } + + // Do the HMMA. + template + inline __device__ void mma( + const Fragment_a& a, + const Fragment_b& b) { + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(0)), "+f"(elt(1)), "+f"(elt(2)), "+f"(elt(3)) + : "r"(a.reg(0)), + "r"(a.reg(1)), + "r"(a.reg(2)), + "r"(a.reg(3)), + "r"(b.reg(0)), + "r"(b.reg(1))); + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5, %6, %7}, \n" + " {%8, %9}, \n" + " {%0, %1, %2, %3}; \n" + : "+f"(elt(4)), "+f"(elt(5)), "+f"(elt(6)), "+f"(elt(7)) + : "r"(a.reg(0)), + "r"(a.reg(1)), + "r"(a.reg(2)), + "r"(a.reg(3)), + "r"(b.reg(2)), + "r"(b.reg(3))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void clear(Fragment (&frag)[M][N]) { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + frag[mi][ni].clear(); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Clear_accumulator { + template + static inline __device__ void apply(Acc (&acc)[M][N], bool = false) { + fmha::clear(acc); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void gemm( + Acc (&acc)[M][N], + const A (&a)[M], + const B (&b)[N]) { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + acc[mi][ni].mma(a[mi], b[ni]); + } + } +} + 
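+// Illustrative usage sketch (placeholder names, not part of the kernel): a
+// warp-level K-step with MMAS_M x MMAS_N accumulator tiles typically looks
+// roughly like
+//
+//   Fragment_accumulator acc_p[MMAS_M][MMAS_N];
+//   fmha::clear(acc_p);                  // zero every accumulator register
+//   /* ...load A fragments frag_q[MMAS_M] and B fragments frag_k[MMAS_N]
+//      from shared memory for this K-step... */
+//   fmha::gemm(acc_p, frag_q, frag_k);   // MMAS_M * MMAS_N mma() calls
+//
+// Each Fragment_accumulator::mma() above issues two m16n8k16 mma.sync
+// instructions, i.e. one full 16x16x16 tensor-core MMA per (mi, ni) pair.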
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void gemm_cl( + Acc (&acc)[M][N], + const A (&a)[M], + const B (&b)[N]) { + using Shape = cutlass::gemm::GemmShape<16 * M, 16 * N, 16>; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; +#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; +#else + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + // TD [2022-06-02] We don't support Volta (SM70) yet. + assert(0); +#endif + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + + using WarpMma = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::arch::OpMultiplyAdd, + 1, + true>::Type; + + constexpr int kIters = Shape::kK / InstructionShape::kK; + // using FragmentA = typename WarpMma::FragmentA; + // using FragmentB = typename WarpMma::FragmentB; + using FragmentA = typename WarpMma::ArchMmaOperator::FragmentA; + using FragmentB = typename WarpMma::ArchMmaOperator::FragmentB; + using FragmentC = typename WarpMma::FragmentC; + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y) == 0) { + // printf("FragmentA::kStorageElements = %d\n", + // FragmentA::kStorageElements); + // printf("Archmma::FragmentA::kStorageElements = %d\n", + // WarpMma::ArchMmaOperator::FragmentA::kStorageElements); + // printf("FragmentB::kStorageElements = %d\n", + // FragmentB::kStorageElements); + // printf("Archmma::FragmentB::kStorageElements = %d\n", + // WarpMma::ArchMmaOperator::FragmentB::kStorageElements); + // printf("FragmentC::kStorageElements = %d\n", + // FragmentC::kStorageElements); + // printf("Archmma::FragmentC::kStorageElements = %d\n", + // WarpMma::ArchMmaOperator::FragmentC::kStorageElements); + // } + + // static_assert(FragmentA::kStorageElements == M * a[0].NUM_REGS); + // static_assert(FragmentB::kStorageElements == N * b[0].NUM_REGS); + static_assert(FragmentA::kStorageElements * kIters == a[0].NUM_REGS); + static_assert( + FragmentB::kStorageElements * kIters * 16 / InstructionShape::kN == + b[0].NUM_REGS); + static_assert(FragmentC::kStorageElements == M * N * acc[0][0].NUM_REGS); + // const FragmentA a_cl = reinterpret_cast(a); + // const FragmentB b_cl = reinterpret_cast(b); + FragmentC c_cl = reinterpret_cast(acc); + FragmentA a_cl[kIters][M]; + FragmentA b_cl[kIters][N]; + constexpr int kRegs = InstructionShape::kK == 16 ? 4 : 2; +#pragma unroll + for (int iter = 0; iter < kIters; iter++) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + uint32_t* a_ptr = a_cl[iter][mi].raw_data(); +#pragma unroll + for (int ki = 0; ki < kRegs; ki++) { + a_ptr[ki] = a[mi].regs_[iter * kRegs + ki]; + } + } + } +#pragma unroll + for (int iter = 0; iter < kIters; iter++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + uint32_t* b_ptr = b_cl[iter][ni].raw_data(); +#pragma unroll + for (int ki = 0; ki < kRegs; ki++) { + // b_ptr[ki] = b[ni].regs_[iter * kRegs + ki]; + // TD [2022-06-02] For some reason the order for frag_b is different. + b_ptr[ki] = b[ni].regs_ + [InstructionShape::kK == 16 ? 
iter * kRegs + ki + : ki * kRegs + iter]; + } + } + } + + WarpMma mma_op; +// mma_op(c_cl, a_cl, b_cl, c_cl); +#pragma unroll + for (int iter = 0; iter < kIters; iter++) { + mma_op( + c_cl, + reinterpret_cast(a_cl[iter]), + reinterpret_cast(b_cl[iter]), + c_cl); + } + +// The modified c_cl is not copied back into acc, idk why +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { +#pragma unroll + for (int i = 0; i < 8; i++) { + acc[mi][ni].elt(i) = c_cl[mi * N * 8 + ni * 8 + i]; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The number of rows in the CTA tile. + int M_, + // The number of cols in the CTA tile. + int N_, + // The number of elements in the the K dimension of the GEMM loop. + int K_, + // The number of rows of warps. + int WARPS_M_, + // The number of cols of warps. + int WARPS_N_, + // The number of warps in the K dimension of the GEMM loop. + int WARPS_K_> +struct Cta_tile_ { + static constexpr int M = M_, N = N_, K = K_; + // The number of warps. + static constexpr int WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, + WARPS_K = WARPS_K_; + // The number of warps per CTA. + static constexpr int WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K; + // The number of threads per warp. + static constexpr int THREADS_PER_WARP = 32; + // The number of threads per CTA. + static constexpr int THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Hmma_tile { + // The number of elements computed with a single warp-MMA. + static constexpr int M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16; + + // The number of elements computed with a single CTA-MMA. + static constexpr int M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M, + N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N, + K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K; + + // The number of MMAs needed to compute the GEMM. + static constexpr int MMAS_M = DivUpConstexpr(Cta_tile::M, M_PER_MMA_PER_CTA), + MMAS_N = DivUpConstexpr(Cta_tile::N, N_PER_MMA_PER_CTA), + MMAS_K = DivUpConstexpr(Cta_tile::K, K_PER_MMA_PER_CTA); + + // // The number of elements computed per warp. 
+ // static constexpr int M_PER_WARP = MMAS_M * M_PER_MMA, + // N_PER_WARP = MMAS_N * N_PER_MMA, + // K_PER_WARP = MMAS_K * K_PER_MMA; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using A_type = uint16_t; +using B_type = uint16_t; +using C_type = uint16_t; +using Accumulator_type = float; +using Epilogue_type = float; + +constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8; +constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8; +constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Cta_tile_extd = Cta_tile_; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Cta_tile_with_k_with_padding = Cta_tile_extd< + Cta_tile_::M, + Cta_tile_::N, + Next_power_of_two::VALUE, + Cta_tile_::WARPS_M, + Cta_tile_::WARPS_N, + Cta_tile_::WARPS_K>; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h new file mode 100644 index 000000000..119ac6a6f --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/gmem_tile.h @@ -0,0 +1,608 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile_, + // The number of bits per element. + int BITS_PER_ELEMENT, + // The number of rows of Q, K or V loaded by this tile. + int ROWS_, + // The number of columns. + int COLS> +struct Gmem_tile_qkv { + using Cta_tile = Cta_tile_; + + static constexpr int BYTES_PER_ELEMENT = BITS_PER_ELEMENT / 8; + // The size of each LDG. + static constexpr int BYTES_PER_LDG = 16; + // The size of a row in bytes. + static constexpr int BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8; + + // The number of threads to load a "row" of the matrix. + static constexpr int THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG; + + static constexpr int ROWS = ROWS_; + // The number of "rows" loaded per LDG. + static constexpr int ROWS_PER_LDG = + Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW; + // The number of LDGs needed to load a chunk of the Q matrix. + static constexpr int LDGS = DivUpConstexpr(ROWS, ROWS_PER_LDG); + + // Ctor. + template + inline __device__ Gmem_tile_qkv( + void* ptr_, + const uint32_t row_stride_in_elts, + const uint32_t head_stride_in_elts, + const BInfo& binfo, + const int tidx) + : row_stride_in_bytes(row_stride_in_elts * BYTES_PER_ELEMENT), + actual_seqlen(binfo.actual_seqlen), + ptr(reinterpret_cast(ptr_)), + tidx_(tidx) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Store the row as we need it to disable the loads. + // TD [2022-04-16]: To minimize registers, we'll recompute row_ instead of + // storing it row_ = row; + + // The row offset in the batched GEMM. For each seq element, we store QKV in + // that order. int64_t row_offset = (int64_t)row * + // params.qkv_stride_in_bytes; + uint32_t row_offset = (uint32_t)((binfo.sum_s + row) * row_stride_in_bytes); + // Add the block index. + // row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + + // binfo.bidh) * BYTES_PER_ROW; + row_offset += + (uint32_t)(binfo.bidh * head_stride_in_elts * BYTES_PER_ELEMENT); + + // Assemble the final pointer. + ptr += row_offset + col * BYTES_PER_LDG; + } + + // Store data to shared memory. 
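+  // commit() simply forwards the fetched registers (fetch_) into the shared
+  // memory tile. Sizing example for fp16 with COLS = 64 (a 64-wide head):
+  //   BYTES_PER_ROW = 64 * 16 / 8 = 128, THREADS_PER_ROW = 128 / 16 = 8,
+  //   and with a 128-thread CTA, ROWS_PER_LDG = 128 / 8 = 16.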
+ template + inline __device__ void commit(Smem_tile& smem_tile) { + smem_tile.store(fetch_); + } + + inline __device__ void load() { + int row_ = tidx_ / THREADS_PER_ROW; + const void* ptrs[LDGS]; + uint32_t preds[LDGS]; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + // ptrs[ii] = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes; + ptrs[ii] = ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes; + preds[ii] = ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)); + fetch_[ii] = make_uint4(0, 0, 0, 0); + } + + // not packing predicates removes restrictions (e.g. FP16 384, 4 warps) + Ldg_functor fct(fetch_, ptrs); +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + fct.load(ii, preds[ii]); + } + } + + // Store data to memory. + inline __device__ void store(const uint4 (&data)[LDGS]) { + int row_ = tidx_ / THREADS_PER_ROW; +#pragma unroll + for (int ii = 0; ii < LDGS; ++ii) { + // char *ptr_ = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes; + char* ptr_ = ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes; + if ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)) { + fmha::stg(ptr_, data[ii]); + } + } + } + + inline __device__ void move(const int steps = 1) { + // ptr += (int64_t)ROWS * row_stride_in_bytes * steps; + ptr += (uint32_t)ROWS * row_stride_in_bytes * steps; + actual_seqlen -= ROWS * steps; + } + + // The stride between rows for the QKV matrice. + // int64_t row_stride_in_bytes; + const uint32_t row_stride_in_bytes; + // The pointer. + char* ptr; + // The fetch registers. + uint4 fetch_[LDGS]; + // Keep track of the row the thread is processing as we move the tile. + // int row_; + const int tidx_; + // The length of the sequence loaded by that memory tile. + int actual_seqlen; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_o { + static_assert(BYTES_PER_ELEMENT == 2 || BYTES_PER_ELEMENT == 4); + + // The mma tile. + using Mma_tile = fmha::Hmma_tile; + + // The size of each element. + // static constexpr int BYTES_PER_ELEMENT = 2; + // The size of each STG. + static constexpr int BYTES_PER_STG = BYTES_PER_ELEMENT * 4; + static constexpr int COLS = Cta_tile::N; + // The size of a row in bytes. + static constexpr int BYTES_PER_ROW = COLS * BYTES_PER_ELEMENT; + + // The number of threads to store a "row" of the matrix. + static constexpr int THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_STG; + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + static constexpr int ROWS = Cta_tile::M; + // The number of "rows" stored per iteration of the loop. The output of 1 MMA. + static constexpr int ROWS_PER_LOOP = + ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA; + // The number of outter loop for the stores. + static constexpr int LOOPS = ROWS / ROWS_PER_LOOP; + + // The number of "rows" stored per STG. + static constexpr int ROWS_PER_STG = + Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW; + // Do we have to guard against partial writes/reads. + static constexpr bool HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0; + // The number of STGs needed to store a chunk of the Q matrix. + static constexpr int STGS_PER_LOOP = + DivUpConstexpr(ROWS_PER_LOOP, ROWS_PER_STG); + // The number of STGs needed to store a chunk of the Q matrix in total. + static constexpr int STGS = STGS_PER_LOOP * LOOPS; + + // Ctor. 
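+  // The constructor derives this thread's base output pointer from the
+  // cumulative sequence offset and the head index:
+  //   ptr_ += (binfo.sum_s + row) * row_stride_in_bytes
+  //         + binfo.bidh * head_stride_in_elts * BYTES_PER_ELEMENT
+  //         + col * BYTES_PER_STG;
+  // where row / col are this thread's position within the store tile.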
+ template + // inline __device__ Gmem_tile_o(void *ptr, const size_t row_stride_in_elts, + // const BInfo &binfo, const int tidx) + inline __device__ Gmem_tile_o( + void* ptr, + const uint32_t row_stride_in_elts, + const uint32_t head_stride_in_elts, + const BInfo& binfo, + const int tidx) + : row_stride_in_bytes(row_stride_in_elts * BYTES_PER_ELEMENT), + actual_seqlen(binfo.actual_seqlen), + ptr_(reinterpret_cast(ptr)), + tidx_(tidx) { + // Compute the position in the sequence (within the CTA for the moment). + int row = tidx / THREADS_PER_ROW; + // Compute the position of the thread in the row. + int col = tidx % THREADS_PER_ROW; + + // Store the row as we need it to disable loads. + // row_ = row; + + // The row offset in the batched GEMM. + // int64_t row_offset = (int64_t)row * row_stride_in_bytes + binfo.bidx * + // BYTES_PER_ROW; + uint32_t row_offset = (uint32_t)((binfo.sum_s + row) * row_stride_in_bytes); + row_offset += + (uint32_t)(binfo.bidh * head_stride_in_elts * BYTES_PER_ELEMENT); + // Assemble the final pointer. + ptr_ += row_offset + col * BYTES_PER_STG; + + // Is that thread active on the last STG? + if (HAS_INCOMPLETE_STG) { + is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M; + } + } + + // Store data to global memory. + inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) { + int row_ = tidx_ / THREADS_PER_ROW; +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (row_ + jj * ROWS_PER_STG >= this->actual_seqlen) { + break; + } + + if (BYTES_PER_ELEMENT == 4) { + if (!HAS_INCOMPLETE_STG || + (jj < STGS - 1 || this->is_active_for_last_stg_)) { + fmha::stg( + this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes, + src[ii]); + } + } else if (BYTES_PER_ELEMENT == 2) { + float x = reinterpret_cast(src[ii].x); + float y = reinterpret_cast(src[ii].y); + float z = reinterpret_cast(src[ii].z); + float w = reinterpret_cast(src[ii].w); + uint2 out = float4_to_half4(x, y, z, w); + if (!HAS_INCOMPLETE_STG || + (jj < STGS - 1 || this->is_active_for_last_stg_)) { + fmha::stg( + this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes, out); + } + } + } + } + + // Store data to global memory. + inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) { + static_assert(BYTES_PER_ELEMENT == 4); + int row_ = tidx_ / THREADS_PER_ROW; +#pragma unroll + for (int ii = 0; ii < STGS_PER_LOOP; ++ii) { + int jj = mi * STGS_PER_LOOP + ii; + if (row_ + jj * ROWS_PER_STG >= this->actual_seqlen) { + break; + } + + if (!HAS_INCOMPLETE_STG || + (jj < STGS - 1 || this->is_active_for_last_stg_)) { + fmha::ldg( + dst[ii], + this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes); + } + } + } + + inline __device__ void move(const int steps = 1) { + // row_ += ROWS * steps; + // ptr_ += (int64_t)ROWS * row_stride_in_bytes * steps; + ptr_ += (uint32_t)ROWS * row_stride_in_bytes * steps; + actual_seqlen -= ROWS * steps; + } + + // The stride between rows for the QKV matrice. + // int64_t row_stride_in_bytes; + const uint32_t row_stride_in_bytes; + // The pointer. + char* ptr_; + // Is the thread active for the last STG? + int is_active_for_last_stg_; + // The length of the sequence loaded by that memory tile. + int actual_seqlen; + const int tidx_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gmem_tile_mma_sd { + // The mma tile. + using Mma_tile = fmha::Hmma_tile; + + // Each STG stores 8 elements. 
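+  // (16 bytes per STG when the elements are fp16, i.e. one uint4.)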
+ static constexpr int BYTES_PER_STG = BYTES_PER_ELEMENT * 8; + // The number of MMAs in the M dimension. + static constexpr int MMAS_M = Mma_tile::MMAS_M; + // The number of MMAs in the N dimension. + static constexpr int MMAS_N = Mma_tile::MMAS_N; + // The number of rows computed per MMA per thread block. + static constexpr int M_PER_MMA_PER_CTA = Mma_tile::M_PER_MMA_PER_CTA; + // The number of cols computed per MMA per thread block. + static constexpr int N_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA; + // The number of threads per block. + static constexpr int THREADS_PER_CTA = Cta_tile::THREADS_PER_CTA; + // The size of each row in bytes. I.e. how many bytes are stored per STG. + static constexpr int BYTES_PER_ROW = THREADS_PER_CTA * BYTES_PER_STG; + // The distance between elements stored per loop (in bytes). + static constexpr int LOOP_STRIDE_BYTES = MMAS_M * MMAS_N * BYTES_PER_ROW; + + // The type of elements stored per STG. + using Type = typename fmha::Uint_from_size_in_bytes::Type; + + // Ctor. + template + inline __device__ Gmem_tile_mma_sd( + void* ptr, + const Params& params, + const int bidb, + const int bidh, + const int tidx) + : ptr_(static_cast(ptr)) { + // The block index. + // size_t bidx = bidb * params.h + bidh; + uint32_t bidx = bidb * params.h + bidh; + + // The distance between two blocks (in bytes). + // const size_t block_stride_bytes = params.s * params.s * + // BYTES_PER_ELEMENT; + const uint32_t block_stride_bytes = params.s * params.s * BYTES_PER_ELEMENT; + // Set store location for each thread at the beginning of the loop + ptr_ += bidx * block_stride_bytes + tidx * BYTES_PER_STG; + } + + // Store to global memory. + inline __device__ void store(const Type& data, const int mi, const int ni) { + // size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW; + uint32_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW; + fmha::stg(ptr_ + offset, data); + } + + // Load from global memory. + inline __device__ void load(Type& data, const int mi, const int ni) { + // size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW; + uint32_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW; + fmha::ldg(data, ptr_ + offset); + } + + // Move to the next tile. + inline __device__ void move(const int steps = 1) { + ptr_ += LOOP_STRIDE_BYTES * steps; + } + + // The pointer in global memory. + char* ptr_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Cta_tile, + typename Base = Gmem_tile_mma_sd> +struct Gmem_tile_mma_s : public Base { + // The number of mmas in the vertical dimension. + static constexpr int M = Base::MMAS_M; + // The number of mmas in the horizontal dimension. + static constexpr int N = Base::MMAS_N; + // The type of the vectors stored by each STG. + using Type = typename Base::Type; + + // Ctor. + template + inline __device__ Gmem_tile_mma_s( + const Params& params, + const Block_info& binfo, + const int tidx) + : Base(params.s_ptr, params, binfo.bidb, binfo.bidh, tidx) {} + + // Store to global memory. 
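+  // Packs each 2x4 block of fp32 softmax values into a uint4 of four half2
+  // pairs and writes it only when the upper-left element of that (mi, ni)
+  // MMA tile is valid according to the mask.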
+ template + inline __device__ void store( + const float (&softmax)[2 * M][4 * N], + const Mask& mask) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + float tmp00 = softmax[2 * mi + 0][4 * ni + 0]; + float tmp01 = softmax[2 * mi + 0][4 * ni + 1]; + float tmp02 = softmax[2 * mi + 0][4 * ni + 2]; + float tmp03 = softmax[2 * mi + 0][4 * ni + 3]; + + float tmp10 = softmax[2 * mi + 1][4 * ni + 0]; + float tmp11 = softmax[2 * mi + 1][4 * ni + 1]; + float tmp12 = softmax[2 * mi + 1][4 * ni + 2]; + float tmp13 = softmax[2 * mi + 1][4 * ni + 3]; + + uint4 dst; + dst.x = fmha::float2_to_half2(tmp00, tmp01); + dst.y = fmha::float2_to_half2(tmp02, tmp03); + dst.z = fmha::float2_to_half2(tmp10, tmp11); + dst.w = fmha::float2_to_half2(tmp12, tmp13); + if (mask.is_valid(mi, ni, 0, 0)) { + Base::store(dst, mi, ni); + } + } + } + } + + // Store to global memory. + template + inline __device__ void store(const Fragment (&frag)[N][M], const Mask& mask) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + uint4 dst; + dst.x = frag[ni][mi].reg(0); + dst.y = frag[ni][mi].reg(2); + dst.z = frag[ni][mi].reg(1); + dst.w = frag[ni][mi].reg(3); + if (mask.any_valid(mi, ni)) { + Base::store(dst, mi, ni); + } + } + } + } + + // Load from global memory. + template + inline __device__ void load(uint4 (®s)[M][N], const Mask& mask) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + regs[mi][ni] = make_uint4(0, 0, 0, 0); + if (mask.any_valid(mi, ni)) { + Base::load(regs[mi][ni], mi, ni); + } + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile> +struct Gmem_summary_stats { + // The Mma tile. + using Mma_tile = fmha::Hmma_tile; + + // The number of MMAs in M/N dimensions. + static constexpr int MMAS_M = Mma_tile::MMAS_M; + + // The size of each element. + static constexpr int BYTES_PER_ELEMENT = 4; + static constexpr int BYTES_PER_MMA = + (Cta_tile::THREADS_PER_WARP / 4) * 2 * BYTES_PER_ELEMENT; + static constexpr int ROWS = Cta_tile::M; + + // Ctor. + template + inline __device__ Gmem_summary_stats( + void* ptr, + const Params& params, + const int tidx) + : ptr_(reinterpret_cast(ptr)), tidx_(tidx) { + // The block index for the batch. + const int bidb = blockIdx.y; + // The block index for the head. + const int bidh = blockIdx.x; + // The block index. + // size_t bidx = bidb * params.h + bidh; + uint32_t bidx = bidb * params.h + bidh; + + // Extract the position in the warp. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // The distance between two blocks (in bytes). + // size_t block_stride_bytes = params.s * BYTES_PER_ELEMENT; + uint32_t block_stride_bytes = params.s * BYTES_PER_ELEMENT; + + // Set store location for each thread at the beginning of the loop + ptr_row_ = ptr_ + bidx * block_stride_bytes; + ptr_ += bidx * block_stride_bytes + (lane / 4) * BYTES_PER_ELEMENT; + } + + // Store data to global memory. 
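+  // Only lanes 0, 4, 8, ... of warp 0 write: each MMA row index mi stores two
+  // fp32 statistics, the second placed 8 * BYTES_PER_ELEMENT bytes after the
+  // first.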
+ inline __device__ void store(const uint32_t (&data)[MMAS_M * 2]) { + int warp = tidx_ / Cta_tile::THREADS_PER_WARP; + int lane = tidx_ % Cta_tile::THREADS_PER_WARP; + if ((warp == 0) && (lane % 4 == 0)) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // TODO: Not sure if it's right for MMAS_M > 1 + fmha::stg( + ptr_ + mi * BYTES_PER_MMA + 0 * BYTES_PER_ELEMENT, + data[mi * 2 + 0]); + fmha::stg( + ptr_ + mi * BYTES_PER_MMA + 8 * BYTES_PER_ELEMENT, + data[mi * 2 + 1]); + } + } + } + + // Store data to global memory. + inline __device__ void store_row( + const uint32_t (&data)[MMAS_M], + const int row) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // TODO: Not sure if it's right for MMAS_M > 1 + fmha::stg( + ptr_row_ + mi * BYTES_PER_MMA + row * BYTES_PER_ELEMENT, data[mi]); + } + } + + // Load from global memory. + inline __device__ void load(uint32_t (&data)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // TODO: Not sure if it's right for MMAS_M > 1 + fmha::ldg( + data[mi * 2 + 0], ptr_ + mi * BYTES_PER_MMA + 0 * BYTES_PER_ELEMENT); + fmha::ldg( + data[mi * 2 + 1], ptr_ + mi * BYTES_PER_MMA + 8 * BYTES_PER_ELEMENT); + } + } + + // Load from global memory. + inline __device__ void load_next( + uint32_t (&data)[MMAS_M * 2], + int move_steps = 1) { + char* ptr_next = ptr_ + move_steps * ROWS * BYTES_PER_ELEMENT; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + // TODO: Not sure if it's right for MMAS_M > 1 + fmha::ldg( + data[mi * 2 + 0], + ptr_next + mi * BYTES_PER_MMA + 0 * BYTES_PER_ELEMENT); + fmha::ldg( + data[mi * 2 + 1], + ptr_next + mi * BYTES_PER_MMA + 8 * BYTES_PER_ELEMENT); + } + } + + // Store data to global memory. + template + inline __device__ void load_row(uint32_t (&data)[N], const int row[N]) { +#pragma unroll + for (int ni = 0; ni < N; ++ni) { + fmha::ldg(data[ni], ptr_row_ + row[ni] * BYTES_PER_ELEMENT); + } + } + + // Move the pointer to the next location. + inline __device__ void move() { + ptr_ += ROWS * BYTES_PER_ELEMENT; + ptr_row_ += ROWS * BYTES_PER_ELEMENT; + } + + // Move the pointer to the next location. + inline __device__ void move(const int steps) { + ptr_ += ROWS * BYTES_PER_ELEMENT * steps; + ptr_row_ += ROWS * BYTES_PER_ELEMENT * steps; + } + + // The pointer. + char* ptr_; + char* ptr_row_; + const int tidx_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h new file mode 100644 index 000000000..27aad1b80 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/kernel_traits.h @@ -0,0 +1,143 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once +#include +#include +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int S, + int D, + int STEP, + int WARPS_M, + int WARPS_N, + uint32_t FLAGS = 0x08u> +struct FMHA_kernel_traits { + // The CTA description for the 1st GEMM. + using Cta_tile_p = fmha::Cta_tile_extd; + // The CTA description for the 2nd GEMM. + using Cta_tile_o = fmha::Cta_tile_extd; + + // Do we use one buffer for K and V. + static constexpr bool SHARE_SMEM_FOR_K_AND_V = (FLAGS & 0x08u) != 0u; + // Do we keep K in registers. + static constexpr bool K_IN_REGS = (FLAGS & 0x10u) == 0u; + // Do we keep V in registers. + static constexpr bool V_IN_REGS = (FLAGS & 0x100u) == 0u; + + // The global memory tile to load Q. + using Gmem_tile_q = + fmha::Gmem_tile_qkv; + + // The shared memory tile to swizzle Q. + // using Smem_tile_q = fmha::Smem_tile_a; + using Smem_tile_q = + fmha::Smem_tile_a; + + // The global memory tile to load K. + using Gmem_tile_k = + fmha::Gmem_tile_qkv; + // The shared memory tile to swizzle K. + using Smem_tile_k = fmha::Smem_tile_b; + + // The global memory tile to load V. + using Gmem_tile_v = + fmha::Gmem_tile_qkv; + // The shared memory tile to swizzle V. + using Smem_tile_v = fmha::Smem_tile_v; + + // The global memory tile to store O. + using Gmem_tile_o = fmha::Gmem_tile_o; + // The shared memory tile for O. + using Smem_tile_o = fmha::Smem_tile_o; + ; + + // The global memory tile to load/store S. + using Gmem_tile_s = fmha::Gmem_tile_mma_s; + + // The shared memory tile to transpose S. + using Smem_tile_st = fmha::Smem_tile_mma_transposed; + + using Gmem_tile_do = + fmha::Gmem_tile_qkv; + + // The global memory tile to store the softmax sum. + using Gmem_softmax_sum = fmha::Gmem_summary_stats; + + // The shared memory tile to store dp sum. + using Smem_dp_sum = fmha::Smem_tile_dp_sum; + + // Make sure the number of threads match. 
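+  // (The global O tile and the shared-memory O tile must agree on how many
+  // threads cooperate on one output row, otherwise the O epilogue layout
+  // would not line up.)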
+ static_assert( + (int)Gmem_tile_o::THREADS_PER_ROW == (int)Smem_tile_o::THREADS_PER_ROW, + ""); + + // The number of threads. + static constexpr int THREADS = Cta_tile_p::THREADS_PER_CTA; + // Make sure the number of threads matches both CTAs. + static_assert(THREADS == Cta_tile_o::THREADS_PER_CTA, ""); + + // The amount of shared memory needed to load Q and K. + static constexpr int BYTES_PER_SMEM_QK = + Smem_tile_q::BYTES_PER_TILE + Smem_tile_k::BYTES_PER_TILE; + // The extra amount of shared memory needed to load V. + static constexpr int BYTES_PER_SMEM_V = + SHARE_SMEM_FOR_K_AND_V ? 0u : Smem_tile_v::BYTES_PER_TILE; + // The amount of shared memory needed for Q, K and V.. + static constexpr int BYTES_PER_SMEM_QKV = + BYTES_PER_SMEM_QK + BYTES_PER_SMEM_V; + // The amount of shared memory needed to load Q and store O. + static constexpr int BYTES_PER_SMEM_QO = + Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE; + + // The amount of shared memory needed for Q, K, V and O. + static constexpr int BYTES_PER_SMEM = + fmha::MaxConstexpr(BYTES_PER_SMEM_QKV, BYTES_PER_SMEM_QO); + // Make sure we have enough shared memory. + static_assert( + Smem_tile_q::BYTES_PER_TILE + Smem_tile_o::BYTES_PER_TILE <= + BYTES_PER_SMEM, + ""); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/mask.h b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h new file mode 100644 index 000000000..ec07012af --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/mask.h @@ -0,0 +1,117 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +namespace fmha { + +template +struct Mask { + using Mma_tile = fmha::Hmma_tile; + + template + __device__ Mask( + const BInfo& blockInfo, + int tidx, + const int loop_step_idx_ = 0) + : actual_seqlen(blockInfo.actual_seqlen - loop_step_idx_ * Cta_tile::N), + loop_step_idx(loop_step_idx_) { + const int warp = tidx / Cta_tile::THREADS_PER_WARP; + const int lane = tidx % Cta_tile::THREADS_PER_WARP; + + static_assert(Cta_tile::WARPS_K == 1, ""); + + // find the warp in the Cta tile + const int warp_n = (warp / Cta_tile::WARPS_M); + const int warp_m = (warp % Cta_tile::WARPS_M); + // decompose warp into 8x4 tile + const int quad = lane / 4; + const int tid = (lane % 4) * 2; + row = warp_m * 16 + quad; + col = warp_n * 16 + tid; + } + + inline __device__ bool is_valid( + const int mi, + const int ni, + const int ii, + const int jj) const { + // ii and jj iterate over the 2x4 fragment + // const int current_col = (Is_causal ? loop_step_idx * Cta_tile::N : 0) + + // ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1); + const int current_col = + ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) * 4 + (jj & 1); + const int current_row = row_offset + ii * 8; + const bool col_valid = current_col < actual_seqlen; + // const bool col_valid = (ni * Mma_tile::N_PER_MMA_PER_CTA + col + (jj & 2) + // * 4 + (jj & 1)) < actual_seqlen; + //&& (row + mi * Mma_tile::M_PER_MMA_PER_CTA + ii * 8) < actual_seqlen; + bool all_valid = Is_causal ? col_valid && + (current_col + loop_step_idx * Cta_tile::N <= current_row) + : col_valid; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("current_col=%d, current_row=%d, actual_seqlen=%d, + // col_valid=%d, all_valid=%d\n", current_col, current_row, + // actual_seqlen, col_valid, all_valid); + // } + return Is_causal ? col_valid && + (current_col + loop_step_idx * Cta_tile::N <= current_row) + : col_valid; + // return row_valid && col_valid; + } + + // BERT Mask: if upper left is invalid, none are valid + inline __device__ bool any_valid(const int mi, const int ni) const { + return is_valid(mi, ni, 0, 0) || is_valid(mi, ni, 1, 0); + } + + inline __device__ void load(const int it) { + row_offset = it * Cta_tile::M + row; + } + int row_offset; + + int row; + int col; + const int loop_step_idx; + const int actual_seqlen; +}; + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h new file mode 100644 index 000000000..0bb8285d2 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/smem_tile.h @@ -0,0 +1,1843 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include "utils.h" + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The description of the tile computed by this CTA. + typename Cta_tile, + // The number of rows in the 2D shared memory buffer. + int M_, + // The number of cols. + int N_, + // The size in bits of each element. + int BITS_PER_ELEMENT_, + // The number of bytes per STS. + int BYTES_PER_STS_ = 16, + // The number of buffers. (Used in multistage and double buffer cases.) + int BUFFERS_PER_TILE_ = 1, + // Do we enable the fast path for LDS.128 and friends. + int ENABLE_LDS_FAST_PATH_ = 0, + // The number of rows that are used for the XOR swizzling to allow fast + // STS/LDS. + int ROWS_PER_XOR_PATTERN_ = 8, + // The number of cols that are used for the XOR swizzling to allow fast + // STS/LDS. + int COLS_PER_XOR_PATTERN_ = 1, + // Use or not predicates + bool USE_PREDICATES_ = true> +struct Smem_tile_without_skews { + // The size in bits of each element. + enum { BITS_PER_ELEMENT = BITS_PER_ELEMENT_ }; + // The size in bytes of a single STS. + enum { BYTES_PER_STS = BYTES_PER_STS_ }; + // The number of elements per STS. + enum { ELEMENTS_PER_STS = BYTES_PER_STS * 8 / BITS_PER_ELEMENT }; + // To support arbitrary N, we pad some values to a power-of-2. 
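+  // As a rough worked example (values assumed purely for illustration): with
+  // 16-bit elements and N_ = 32, N_WITH_PADDING below is 32, so
+  // BYTES_PER_ROW_BEFORE_PACKING = 32 * 16 / 8 = 64. Because we want rows of
+  // at least 128B, BYTES_PER_ROW = 128 and ROWS = M_ * 64 / 128 = M_ / 2,
+  // i.e. two matrix rows get packed into a single row of shared memory.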
+ enum { N_WITH_PADDING = Next_power_of_two::VALUE }; + // The number of bytes per row without packing of rows. + enum { BYTES_PER_ROW_BEFORE_PACKING = N_WITH_PADDING * BITS_PER_ELEMENT / 8 }; + // The number of bytes per row -- we want at least 128B per row. + enum { BYTES_PER_ROW = Max::VALUE }; + // The number of rows in shared memory (two rows may be packed into a single + // one). + enum { ROWS = M_ * BYTES_PER_ROW_BEFORE_PACKING / BYTES_PER_ROW }; + + // The number of threads per row. + enum { THREADS_PER_ROW_UNBOUNDED = BYTES_PER_ROW / BYTES_PER_STS }; + // The number of threads per row. + enum { + THREADS_PER_ROW = + Min::VALUE + }; + + // The number of STS per row. + enum { STS_PER_ROW = BYTES_PER_ROW / THREADS_PER_ROW / BYTES_PER_STS }; + // It must be at least one. + static_assert(STS_PER_ROW >= 1, ""); + // The number of rows written with a single STS. + enum { ROWS_PER_STS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + // Make sure we write to at least one row per STS. Thanks Dr. Obvious ;) + static_assert(ROWS_PER_STS >= 1, ""); + // The number of STS needed to store all rows. + enum { STS_PER_COL = Div_up::VALUE }; + // The number of STS in total. + enum { STS = STS_PER_COL * STS_PER_ROW }; + + // TD [2022-06-02] In the case of Q (16 x 64) in the backward pass with 256 + // threads, we only need to store 16 * 64 * 2 = 2KB instead of 4KB. + static constexpr bool PARTIAL_STORE = ROWS_PER_STS > ROWS; + static constexpr int STORING_THREADS = + PARTIAL_STORE ? ROWS * THREADS_PER_ROW : Cta_tile::THREADS_PER_CTA; + + // The size of one buffer in bytes in shared memory. + // enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * Cta_tile::THREADS_PER_CTA + // }; + enum { BYTES_PER_BUFFER = STS * BYTES_PER_STS * STORING_THREADS }; + // The number of buffers. + enum { BUFFERS_PER_TILE = BUFFERS_PER_TILE_ }; + // The size in bytes of total buffers. + enum { BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE }; + // The boundary for smem_read_offset and smem_write_offset increment. + enum { BYTES_PER_TILE_INC_BOUNDARY = BYTES_PER_TILE - BYTES_PER_BUFFER }; + + // Do we enable the LDS.128 fast path? + enum { ENABLE_LDS_FAST_PATH = ENABLE_LDS_FAST_PATH_ }; + static_assert(ENABLE_LDS_FAST_PATH == 0); + // The number of rows that are used for the XOR swizzling to allow fast + // STS/LDS. + enum { ROWS_PER_XOR_PATTERN = ROWS_PER_XOR_PATTERN_ }; + // The number of cols that are used for the XOR swizzling to allow fast + // STS/LDS. + enum { COLS_PER_XOR_PATTERN = COLS_PER_XOR_PATTERN_ * 16 / BYTES_PER_STS }; + // Use or not predicates + enum { USE_PREDICATES = USE_PREDICATES_ }; + + // The type of elements that are stored in shared memory by each thread. + using Store_type = typename Uint_from_size_in_bytes::Type; + + // Ctor. + inline __device__ Smem_tile_without_skews(void* smem, int tidx) + : smem_(__nvvm_get_smem_pointer(smem)), tidx_(tidx) { + // The row written by a thread. See doc/mma_smem_layout.xlsx. + int smem_write_row = tidx / THREADS_PER_ROW; + + // The XOR pattern. + int smem_write_xor = + smem_write_row % ROWS_PER_XOR_PATTERN * COLS_PER_XOR_PATTERN; + // Compute the column and apply the XOR pattern. + int smem_write_col = (tidx % THREADS_PER_ROW) ^ smem_write_xor; + + // The offset. + this->smem_write_offset_ = + smem_write_row * BYTES_PER_ROW + smem_write_col * BYTES_PER_STS; + + // TODO: Why not merge it with the read offset? 
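+    // Write-offset sketch (assumed values, not a specific instantiation): with
+    // THREADS_PER_ROW = 8, ROWS_PER_XOR_PATTERN = 8, COLS_PER_XOR_PATTERN = 1
+    // and BYTES_PER_STS = 16, thread tidx = 17 writes row 17 / 8 = 2, gets the
+    // XOR pattern 2 % 8 * 1 = 2 and column (17 % 8) ^ 2 = 3, i.e. the offset
+    // 2 * BYTES_PER_ROW + 3 * 16. The XOR rotates which 16B slot each row
+    // uses, so STS/LDS accesses from a warp avoid shared memory bank conflicts.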
+ // this->smem_read_buffer_ = __shfl_sync(0xffffffff, 0, 0); + // this->smem_write_buffer_ = __shfl_sync(0xffffffff, 0, 0); + } + + // Compute the store pointers. + template + inline __device__ void compute_store_pointers(uint32_t (&ptrs)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + // Decompose the STS into row/col. + int row = ii / STS_PER_ROW; + int col = ii % STS_PER_ROW; + + // Assemble the offset. + int offset = smem_write_offset_ + row * ROWS_PER_STS * BYTES_PER_ROW; + + // Take the column into account. + if (STS_PER_ROW > 1) { + offset += col * THREADS_PER_ROW * BYTES_PER_STS; + } + + // Apply the XOR pattern if needed. + if (ROWS_PER_STS < ROWS_PER_XOR_PATTERN) { + const int m = row * ROWS_PER_STS % ROWS_PER_XOR_PATTERN; + offset ^= m * COLS_PER_XOR_PATTERN * BYTES_PER_STS; + } + + // Assemble the final pointer :) + // ptrs[ii] = smem_ + offset + smem_write_buffer_; + // smem_write_buffer_ is already merged with smem_write_offset_ + ptrs[ii] = smem_ + offset; + } + } + + inline __device__ void debug_reset() { + for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) { + for (int row = 0; row < ROWS; ++row) { + for (int col = 0; col < BYTES_PER_ROW; col += 4) { + if (threadIdx.x == 0) { + uint32_t val = 0x0; + sts(val, smem_ + row * BYTES_PER_ROW + col + buffer); + } + } + } + } + } + + // Print the content of the tile (only for debug ;)). + inline __device__ void debug_print() const { + for (int buffer = 0; buffer < BYTES_PER_TILE; buffer += BYTES_PER_BUFFER) { + for (int row = 0; row < ROWS; ++row) { + for (int col = 0; col < BYTES_PER_ROW; col += 4) { + if (threadIdx.x == 0) { + uint32_t val; + lds(val, smem_ + row * BYTES_PER_ROW + col + buffer); + printf( + "block=(x=%2d, y=%2d, z=%2d) (smem_=%2d, buffer=%2d, row=%2d, byte=%4d)=0x%08x\n", + blockIdx.x, + blockIdx.y, + blockIdx.z, + smem_, + buffer, + row, + col, + val); + } + } + } + } + } + + // Move the read offset to next buffer. + inline __device__ void move_to_next_read_buffer() { + // if( BUFFERS_PER_TILE > 1 && smem_read_buffer_ >= + // BYTES_PER_TILE_INC_BOUNDARY ) { + // this->smem_read_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY; + // } else if( BUFFERS_PER_TILE > 1 ) { + // this->smem_read_buffer_ += BYTES_PER_BUFFER; + // } + if (BUFFERS_PER_TILE > 1 && + smem_read_offset_ >= BYTES_PER_TILE_INC_BOUNDARY) { + this->smem_read_offset_ -= BYTES_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_read_offset_ += BYTES_PER_BUFFER; + } + } + + // Move the read offset to next buffer. TODO: Remove this member function!!! + inline __device__ void move_next_read_buffer() { + this->move_to_next_read_buffer(); + } + + // Move the read offset to next N buffer (circular-buffer). + inline __device__ void move_to_next_read_buffer(int N) { + if (BUFFERS_PER_TILE > 1) { + // this->smem_read_buffer_ += N * BYTES_PER_BUFFER; + // this->smem_read_buffer_ -= smem_read_buffer_ >= BYTES_PER_TILE ? + // BYTES_PER_TILE : 0; + this->smem_read_offset_ += N * BYTES_PER_BUFFER; + this->smem_read_offset_ -= + smem_read_offset_ >= BYTES_PER_TILE ? BYTES_PER_TILE : 0; + } + } + + // Move the read offset to next N buffer (circular-buffer). TODO: Remove this + // member function!!! + inline __device__ void move_next_read_buffer(int N) { + this->move_to_next_read_buffer(N); + } + + // Move the write offset to next buffer. 
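+  // Example (a sketch assuming double buffering, BUFFERS_PER_TILE = 2): the
+  // boundary BYTES_PER_TILE_INC_BOUNDARY equals BYTES_PER_BUFFER, so these
+  // move_to_next_*_buffer() helpers add BYTES_PER_BUFFER to hop from buffer 0
+  // to buffer 1 and subtract the boundary to wrap back, cycling between the
+  // two buffers without ever indexing past the end of the tile.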
+ inline __device__ void move_to_next_write_buffer() { + // if( BUFFERS_PER_TILE > 1 && smem_write_buffer_ >= + // BYTES_PER_TILE_INC_BOUNDARY ) { + // this->smem_write_buffer_ -= BYTES_PER_TILE_INC_BOUNDARY; + // } else if( BUFFERS_PER_TILE > 1 ) { + // this->smem_write_buffer_ += BYTES_PER_BUFFER; + // } + if (BUFFERS_PER_TILE > 1 && + smem_write_offset_ >= BYTES_PER_TILE_INC_BOUNDARY) { + this->smem_write_offset_ -= BYTES_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_write_offset_ += BYTES_PER_BUFFER; + } + } + + // Move the write offset to next buffer. TODO: Remove that member function! + inline __device__ void move_next_write_buffer() { + this->move_to_next_write_buffer(); + } + + // Move the read offset. + inline __device__ void move_read_offset(int delta) { + this->smem_read_offset_ += delta; + } + + // Move the write offset. + inline __device__ void move_write_offset(int delta) { + this->smem_write_offset_ += delta; + } + + // Store to the tile in shared memory. + template + inline __device__ void store(const Store_type (&data)[N], uint64_t = 0) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + // Trying to reduce the shared mem for Q from 4KB per buffer to 2KB per + // buffer. + if (!PARTIAL_STORE || (tidx_ / THREADS_PER_ROW < ROWS)) { + sts(smem_ptrs, data); + } + } + + // Store to the tile in shared memory. + template + inline __device__ void store( + const Store_type (&data)[N], + uint32_t (&preds)[M], + uint64_t = 0) { + uint32_t smem_ptrs[N]; + this->compute_store_pointers(smem_ptrs); + sts(smem_ptrs, data, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store( + const Store_type (&data)[N], + uint32_t preds, + uint64_t = 0) { + this->store(data, preds); + } + + // Store to the tile in shared memory. + template + inline __device__ void store( + const void* (&gmem_ptrs)[N], + uint32_t preds, + uint64_t = 0) { + uint32_t tmp[1] = {preds}; + this->store(gmem_ptrs, tmp); + } + + // The shared memory pointer. + const uint32_t smem_; + // The read offset. Reserve 4 offsets if needed. + int smem_read_offset_; + // The write offset. + int smem_write_offset_; + // The buffer base offset for read. + // int smem_read_buffer_; + // The buffer base offset for write. + // int smem_write_buffer_; + const int tidx_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The layout of the tile. + typename Layout, + // The size of the STS. + int BYTES_PER_STS = 16, + // The number of buffers per tile. + int BUFFERS_PER_TILE = 1, + // Use or not predicates + bool USE_PREDICATES = true> +struct Smem_tile_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask { + // The potential mask. + enum { HALF = MMAS_K_WITH_PADDING / 2 }; + // The remainder. + enum { MOD = MMAS_K % HALF }; + // The final value. + enum { + VALUE = (MMAS_K == MOD ? 
0 : HALF) | Compute_reset_mask::VALUE + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask<0, MMAS_K_WITH_PADDING> { + enum { VALUE = 0 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Compute_reset_mask { + enum { VALUE = MMAS_K - 1 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_a { + // The size in bits. + enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_A }; + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 4 : 8) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_row_a : public Rows_per_xor_pattern_a {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_a::VALUE> +struct Smem_tile_row_a : public Smem_tile_without_skews< + Cta_tile, + Cta_tile::M, + Cta_tile::K, + fmha::BITS_PER_ELEMENT_A, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + 1> { + // The MMA tile. + using Mma_tile = fmha::Hmma_tile; + // The base class. + using Base = Smem_tile_without_skews< + Cta_tile, + Cta_tile::M, + Cta_tile::K, + fmha::BITS_PER_ELEMENT_A, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + 1>; + // The fragment. + using Fragment = Fragment_a; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = fmha::Hmma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_row_a(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + const int WARPS_M = Cta_tile::WARPS_M; + const int WARPS_N = Cta_tile::WARPS_N; + const int WARPS_K = Cta_tile::WARPS_K; + + static_assert(WARPS_M == 1); + static_assert(WARPS_N == 4 || WARPS_N == 8); + static_assert(WARPS_K == 1); + static_assert( + Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 8); + + // The row and column read by the thread. + int smem_read_row = (tidx & 0x0f); + constexpr int ROWS_PER_PACKING = + Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING; + int smem_read_col = + ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * + Base::COLS_PER_XOR_PATTERN; + smem_read_col ^= (tidx & 0x10) / 16; + + // The shared memory offset. + this->smem_read_offset_ = + smem_read_row * Base::BYTES_PER_ROW_BEFORE_PACKING + + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. 
+ if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Load from shared memory. + inline __device__ void load(Fragment (&a)[Mma_tile::MMAS_M], int ki) { +#pragma unroll + for (int mi = 0; mi < Mma_tile::MMAS_M; ++mi) { + // Jump by as many matrix rows as needed (a row in smem may pack multiple + // matrix rows). + int offset = + mi * Mma_tile::M_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // Load using LDSM.M88.4. + uint4 tmp; + // ldsm(tmp, this->smem_ + this->smem_read_offset_ + + // this->smem_read_buffer_ + offset); + ldsm(tmp, this->smem_ + this->smem_read_offset_ + offset); + + // Store the value into the fragment. + a[mi].reg(0) = tmp.x; + a[mi].reg(1) = tmp.y; + a[mi].reg(2) = tmp.z; + a[mi].reg(3) = tmp.w; + } + + // Move the offset to the next possition. See doc/mma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_a + : public Smem_tile_row_a { + // The base class. + using Base = Smem_tile_row_a; + + // Ctor. + inline __device__ Smem_tile_a(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The layout of the tile. + typename Layout, + // The size of the STS. + int BYTES_PER_STS = 16, + // The number of buffers per tile. + int BUFFERS_PER_TILE = 1, + // Use or not predicates + bool USE_PREDICATES = true> +struct Smem_tile_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_b { + // The size in bits. + enum { N_IN_BITS = N * fmha::BITS_PER_ELEMENT_B }; + // The number of rows. + enum { VALUE = N_IN_BITS <= 256 ? 2 : (N_IN_BITS <= 512 ? 
4 : 8) }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_col_b : public Rows_per_xor_pattern_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_col_b::VALUE> +struct Smem_tile_col_b : public Smem_tile_without_skews< + Cta_tile, + Cta_tile::N, + Cta_tile::K, + fmha::BITS_PER_ELEMENT_B, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + 1> { + // The MMA tile. + using Mma_tile = fmha::Hmma_tile; + // The base class. + using Base = Smem_tile_without_skews< + Cta_tile, + Cta_tile::N, + Cta_tile::K, + fmha::BITS_PER_ELEMENT_B, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + 1>; + // The fragment. + using Fragment = Fragment_b; + + // When we use padding to reach a power of two, special care has to be taken. + using Cta_tile_with_padding = Cta_tile_with_k_with_padding; + // The number of MMAs. + using Mma_tile_with_padding = fmha::Hmma_tile; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // The number of STS per thread + enum { + STS_PER_THREAD_ = + Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA + }; + // The number of STS per thread must be at least 1. + enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE }; + + // Ctor. + inline __device__ Smem_tile_col_b(void* smem, int tidx) : Base(smem, tidx) { + // For documentation on the layout, see doc/mma_smem_layout.xlsx. + + // The number of warps. + const int WARPS_M = Cta_tile::WARPS_M; + const int WARPS_N = Cta_tile::WARPS_N; + const int WARPS_K = Cta_tile::WARPS_K; + static_assert( + Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 8); + static_assert(WARPS_M == 1); + static_assert(WARPS_N == 4 || WARPS_N == 8); + static_assert(WARPS_K == 1); + + // The masks to select the warps. + const int WARP_MASK_N = Warp_masks::N; + + // The divisor for the warps. + const int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + + // The row and column read by the thread. + int smem_read_row = + (tidx & WARP_MASK_N) / WARP_DIV_N * Mma_tile::N_PER_MMA + + (tidx & 0x07) + (tidx & 0x10) / 2; + constexpr int ROWS_PER_PACKING = + Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING; + int smem_read_col = + ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * + Base::COLS_PER_XOR_PATTERN; + smem_read_col ^= (tidx & 0x08) / 8; + // The shared memory offset. + this->smem_read_offset_ = + smem_read_row * Base::BYTES_PER_ROW_BEFORE_PACKING + + smem_read_col * BYTES_PER_LDS; + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. + if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Load from shared memory. 
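+  // Worked illustration (assuming Mma_tile_with_padding::MMAS_K == 4; the
+  // numbers are only an example): BYTES_PER_LDS * 2 = 32, so the load below
+  // XORs the read offset by 32 after even ki and by 3 * 32 = 96 after odd ki.
+  // Starting from some offset o, the four K-slices are therefore read at
+  // o, o ^ 32, o ^ 64 and o ^ 96, and the final XOR returns the offset to o.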
+ inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Jump by as many matrix rows as needed (a row in smem may pack multiple + // matrix rows). + int offset = + ni * Mma_tile::N_PER_MMA_PER_CTA * Base::BYTES_PER_ROW_BEFORE_PACKING; + + // Load using LDSM.M88.4. + uint4 tmp; + // ldsm(tmp, this->smem_ + this->smem_read_offset_ + + // this->smem_read_buffer_ + offset); + ldsm(tmp, this->smem_ + this->smem_read_offset_ + offset); + + // Store the value into the fragment. + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + } + + // Move the offset to the next possition. See doc/mma_smem_layout.xlsx. + static_assert(Mma_tile_with_padding::MMAS_K < 64, "Not implemented"); + if (Mma_tile_with_padding::MMAS_K >= 32 && ki % 16 == 15) { + this->smem_read_offset_ ^= 31 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 16 && ki % 8 == 7) { + this->smem_read_offset_ ^= 15 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 8 && ki % 4 == 3) { + this->smem_read_offset_ ^= 7 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 4 && ki % 2 == 1) { + this->smem_read_offset_ ^= 3 * BYTES_PER_LDS * 2; + } else if (Mma_tile_with_padding::MMAS_K >= 2) { + this->smem_read_offset_ ^= 1 * BYTES_PER_LDS * 2; + } + } + + // Reset the read offset. + inline __device__ void reset_read_offset() { + // The number of MMAs in the K dimension. + enum { MMAS_K = Mma_tile::MMAS_K }; + // The number of MMAs in the K dimension when we include padding. + enum { MMAS_K_WITH_PADDING = Mma_tile_with_padding::MMAS_K }; + // Assemble the mask. + enum { MASK = Compute_reset_mask::VALUE }; + + // Reset the read offset. + this->smem_read_offset_ ^= MASK * BYTES_PER_LDS * 2; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_col_b { + // The base class. + using Base = Smem_tile_col_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Rows_per_xor_pattern_row_b : public Rows_per_xor_pattern_b {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE, + // How many rows to use for the XOR pattern to avoid bank conflicts? + int ROWS_PER_XOR_PATTERN_ = Rows_per_xor_pattern_row_b::VALUE, + // How many cols to use for the XOR pattern to avoid bank conflicts? + int COLS_PER_XOR_PATTERN_ = 1> +struct Smem_tile_row_b : public Smem_tile_without_skews< + Cta_tile, + Cta_tile::K, + Cta_tile::N, + fmha::BITS_PER_ELEMENT_B, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + COLS_PER_XOR_PATTERN_> { + // The MMA tile. + using Mma_tile = fmha::Hmma_tile; + // The base class. 
+ using Base = Smem_tile_without_skews< + Cta_tile, + Cta_tile::K, + Cta_tile::N, + fmha::BITS_PER_ELEMENT_B, + BYTES_PER_STS, + BUFFERS_PER_TILE, + 0, + ROWS_PER_XOR_PATTERN_, + COLS_PER_XOR_PATTERN_>; + // The fragment. + using Fragment = Fragment_b; + + // Can we use LDSM? No if the data type is 32-bit large. + enum { USE_LDSMT = fmha::BITS_PER_ELEMENT_B == 16 }; + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = USE_LDSMT ? 16 : 4 }; + // The number of elements per LDS. + enum { ELEMENTS_PER_LDS = BYTES_PER_LDS * 8 / fmha::BITS_PER_ELEMENT_B }; + + // The number of STS per thread + enum { + STS_PER_THREAD_ = + Base::ROWS * Base::THREADS_PER_ROW / Cta_tile::THREADS_PER_CTA + }; + // The number of STS per thread must be at least 1. + enum { STS_PER_THREAD = Max<1, STS_PER_THREAD_>::VALUE }; + + // Ctor. + inline __device__ Smem_tile_row_b(void* smem, int tidx) : Base(smem, tidx) { + // The number of warps. + const int WARPS_M = Cta_tile::WARPS_M; + const int WARPS_N = Cta_tile::WARPS_N; + const int WARPS_K = Cta_tile::WARPS_K; + static_assert(WARPS_K == 1); + static_assert(WARPS_M == 4 || WARPS_M == 8); + static_assert(WARPS_N == 1); + + // The masks to select the warps. + const int WARP_MASK_N = Warp_masks::N; + const int WARP_MASK_K = Warp_masks::K; + + // The divisor for the warps. + const int WARP_DIV_N = WARPS_M * 1 * Cta_tile::THREADS_PER_WARP; + const int WARP_DIV_K = WARPS_M * WARPS_N * Cta_tile::THREADS_PER_WARP; + + static_assert(USE_LDSMT); + static_assert( + Base::ROWS_PER_XOR_PATTERN == 2 || Base::ROWS_PER_XOR_PATTERN == 4 || + Base::ROWS_PER_XOR_PATTERN == 8); + + // The row/col read by the thread. + int smem_read_row = + (tidx & WARP_MASK_K) / WARP_DIV_K * Mma_tile::MMAS_K * 16 + + (tidx & 0x07) + (tidx & 0x08); + constexpr int ROWS_PER_PACKING = + Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING; + int smem_read_col = + ((smem_read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * + Base::COLS_PER_XOR_PATTERN; + smem_read_col ^= (tidx & WARP_MASK_N) / WARP_DIV_N * 2 + (tidx & 0x10) / 16; + + // The shared memory offset. + this->smem_read_offset_ = + smem_read_row * Base::BYTES_PER_ROW_BEFORE_PACKING + + smem_read_col * BYTES_PER_LDS; + + // Fill zeroes for group conv + } + + // Rewind smem_read_offset for last LDS phase in main loop. + inline __device__ void reverse_smem_read_offset(int ki = 0) { + // The size of each element in bits. + const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B; + // The size in bytes of the data needed to compute an MMA per CTA. + const int BYTES_PER_MMA_PER_CTA = + Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8; + +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Undo the pointer increment for the next ni. + // Should match the load function below for ki = 0. + if (BYTES_PER_MMA_PER_CTA >= 128) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6); + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels) + if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && + Mma_tile::MMAS_N % 2 == 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } + } + + // Load from shared memory. 
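+  // Rough example (assumed sizes, for illustration only): when
+  // BYTES_PER_MMA_PER_CTA == 32 and Mma_tile::MMAS_N == 4, the loop below
+  // toggles smem_read_offset_ by 2 * BYTES_PER_LDS after even ni and by
+  // 6 * BYTES_PER_LDS after odd ni, so the four MMAs read from offsets
+  // o, o ^ 32, o ^ 64 and o ^ 96 and the pointer ends up back at o.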
+ inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { + // The size of each element in bits. + const int BITS_PER_ELT = fmha::BITS_PER_ELEMENT_B; + // The size in bytes of the data needed to compute an MMA per CTA. + const int BYTES_PER_MMA_PER_CTA = + Mma_tile::N_PER_MMA_PER_CTA * BITS_PER_ELT / 8; + +// uint32_t smem_read_og = this->smem_ + this->smem_read_offset_; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Prepare the offset. + int offset = ki * Base::ROWS_PER_XOR_PATTERN * 2 * + Base::BYTES_PER_ROW_BEFORE_PACKING; + if (BYTES_PER_MMA_PER_CTA == 32) { + offset += this->smem_read_offset_; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + offset += + this->smem_read_offset_ + (ni / 2) * BYTES_PER_MMA_PER_CTA * 2; + } else { + offset += this->smem_read_offset_ + (ni)*BYTES_PER_MMA_PER_CTA; + } + + // Load the data using LDSM.MT88.2. + // uint32_t ptr = this->smem_ + this->smem_read_buffer_ + offset; + uint32_t ptr = this->smem_ + offset; + uint4 tmp; + if (USE_LDSMT) { + ldsmt(tmp, ptr); + } else { + lds(tmp.x, (ptr) + 0 * Base::BYTES_PER_ROW_BEFORE_PACKING); + lds(tmp.y, (ptr) + 4 * Base::BYTES_PER_ROW_BEFORE_PACKING); + lds(tmp.z, (ptr ^ 32) + 0 * Base::BYTES_PER_ROW_BEFORE_PACKING); + lds(tmp.w, (ptr ^ 32) + 4 * Base::BYTES_PER_ROW_BEFORE_PACKING); + } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("BYTES_PER_MMA_PER_CTA=%d, ni = %d, smem_read diff = %d\n", + // BYTES_PER_MMA_PER_CTA, ni, ptr - smem_read_og); + // } + // Store those values in the fragment. + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + + // Move the pointer for the next ni. I expect the compiler to not + // recompute those. + if (BYTES_PER_MMA_PER_CTA >= 128) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } else if (BYTES_PER_MMA_PER_CTA == 64) { + // Nothing to do! + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 8) { + this->smem_read_offset_ ^= + BYTES_PER_LDS * (ni % 4 == 3 ? 14 : (ni % 2 == 1 ? 6 : 2)); + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6); + } else if (BYTES_PER_MMA_PER_CTA == 32 && Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } + } + + // Reset smem_read_offset for odd MMAS_N > 1 (npo2 kernels) + if (BYTES_PER_MMA_PER_CTA == 64 && Mma_tile::MMAS_N > 1 && + Mma_tile::MMAS_N % 2 == 1) { + this->smem_read_offset_ ^= BYTES_PER_MMA_PER_CTA; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // The dimensions of the tile computed by the CTA. + typename Cta_tile, + // The size of the STS. + int BYTES_PER_STS, + // The number of buffers per tile. + int BUFFERS_PER_TILE> +struct Smem_tile_b + : public Smem_tile_row_b { + // The base class. + using Base = Smem_tile_row_b; + + // Ctor. + inline __device__ Smem_tile_b(void* smem, int tidx) : Base(smem, tidx) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_v : public fmha::Smem_tile_without_skews< + Cta_tile, + Cta_tile::K, + Cta_tile::N, + 16, + 16, + 1, + 0, + Rows_per_xor_pattern_col_b::VALUE, + 1> { + // The base class. 
+ using Base = Smem_tile_without_skews< + Cta_tile, + Cta_tile::K, + Cta_tile::N, + 16, + 16, + 1, + 0, + Rows_per_xor_pattern_col_b::VALUE, + 1>; + // The MMA tile. + using Mma_tile = fmha::Hmma_tile; + // The fragment. + using Fragment = Fragment_b; + + // The size of a single LDS in bytes. + enum { BYTES_PER_LDS = 16 }; + + // Ctor. + inline __device__ Smem_tile_v(void* smem, int tidx) : Base(smem, tidx) { + // The row/col read by the thread. + int read_row, read_col; + + static_assert( + Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 && + (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8)); + + read_row = (tidx & 0xe0) / 2 + (tidx & 0x0f); + constexpr int ROWS_PER_PACKING = + Base::BYTES_PER_ROW / Base::BYTES_PER_ROW_BEFORE_PACKING; + read_col = ((read_row / ROWS_PER_PACKING) % Base::ROWS_PER_XOR_PATTERN) * + Base::COLS_PER_XOR_PATTERN; + read_col ^= (tidx & 0x10) / 16; + + // The shared memory offset. + this->smem_read_offset_ = read_row * Base::BYTES_PER_ROW_BEFORE_PACKING + + read_col * BYTES_PER_LDS; + } + + // Load from shared memory. + inline __device__ void load(Fragment (&b)[Mma_tile::MMAS_N], int ki) { +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // Jump by 16 * #warps row. + int row = ki * 16 * Cta_tile::WARPS_K; + + // Load the data using LDSM.MT88.2. + uint4 tmp; + fmha::ldsmt( + tmp, + this->smem_ + this->smem_read_offset_ + + row * Base::BYTES_PER_ROW_BEFORE_PACKING); + b[ni].reg(0) = tmp.x; + b[ni].reg(1) = tmp.y; + b[ni].reg(2) = tmp.z; + b[ni].reg(3) = tmp.w; + + // Move the pointer for the next ni. I expect the compiler to not + // recompute those. + if (Mma_tile::MMAS_N == 1) { + // noop + } else if (Mma_tile::MMAS_N == 2) { + this->smem_read_offset_ ^= BYTES_PER_LDS * 2; + } else if (Mma_tile::MMAS_N == 4) { + this->smem_read_offset_ ^= BYTES_PER_LDS * (ni % 2 == 0 ? 2 : 6); + } else if (Mma_tile::MMAS_N == 8) { + this->smem_read_offset_ ^= + BYTES_PER_LDS * (ni % 4 == 3 ? 14 : (ni % 2 == 1 ? 6 : 2)); + } else { + assert(false); // Not implemented! + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_o { + // The MMA tile. + using Mma_tile = fmha::Hmma_tile; + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + // The accumulators. + using Data_type = typename Accumulator::Data_type; + + // The size of each element. + static constexpr int BYTES_PER_ELEMENT = sizeof(Data_type); + // The size of each STS. + static constexpr int BYTES_PER_STS = 8; + // The size of each row in shared memory. + static constexpr int BYTES_PER_ROW = + Cta_tile::N * Cta_tile::WARPS_K * BYTES_PER_ELEMENT; + + // The size of each LDS. + static constexpr int BYTES_PER_LDS = 16; + static constexpr int THREADS_PER_ROW = + Cta_tile::N * BYTES_PER_ELEMENT / BYTES_PER_LDS; + + // The number of rows. + static constexpr int ROWS = Cta_tile::M; + // The number of "rows" to process per loop iteration (in the "epilogue"). + static constexpr int ROWS_PER_LOOP = + ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA; + // The number of outer loops. + static constexpr int LOOPS = ROWS / ROWS_PER_LOOP; + // Make sure it matches our expectations. + static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, ""); + + // The number of rows loaded per LDS. + static constexpr int ROWS_PER_LDS = + Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW; + // Do we have to guard against partial writes/reads. 
+ static constexpr bool HAS_INCOMPLETE_LDS = ROWS_PER_LOOP % ROWS_PER_LDS != 0; + // The total number of LDS per loop. + static constexpr int LDS_PER_LOOP = + fmha::DivUpConstexpr(ROWS_PER_LOOP, ROWS_PER_LDS); + + // The amount of shared memory. + static constexpr int BYTES_PER_TILE = ROWS_PER_LOOP * BYTES_PER_ROW; + + // The write pointer. + uint32_t smem_write_, smem_read_; + // Is the thread active for the last LDS of the series? + int is_active_for_last_lds_; + + // static_assert(BYTES_PER_ROW == 64 * 4 * Cta_tile::WARPS_K); + static_assert(LOOPS == 1 || LOOPS == (int)Mma_tile::MMAS_M, ""); + + // Ctor. + inline __device__ Smem_tile_o(void* smem, int tidx) { + // Get a 32-bit value for the shared memory address. + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + static_assert( + Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 1 && + (Cta_tile::WARPS_K == 4 || Cta_tile::WARPS_K == 8)); + static_assert( + Cta_tile::N == 16 || Cta_tile::N == 32 || Cta_tile::N == 64 || + Cta_tile::N == 128); + + int write_row = (tidx & 0x1c) / 4; + + const int lane = tidx % 32; + const int warp = tidx / 32; + + constexpr int ELEMENTS_PER_STS = BYTES_PER_STS / BYTES_PER_ELEMENT; + constexpr int STS_PER_WARP = 16 * Mma_tile::MMAS_N / ELEMENTS_PER_STS; + int write_col = warp * STS_PER_WARP + lane % STS_PER_WARP; + + // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("write_row = %d, write_col = %d\n", write_row, write_col); + // } + + // if ((blockIdx.x == 0) && (blockIdx.y == 0) && (write_row == 0) && + // (write_col == 0)) { + // printf("threadIdx.x = %d\n", threadIdx.x); + // } + + // Assemble the write pointer. + smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + + // The element read by each thread. + int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + + // Take the XOR pattern into account for the column. + read_col ^= + 2 * (read_row % (Cta_tile::N == 16 ? 2 : (Cta_tile::N == 32 ? 4 : 8))); + // read_col ^= 2 * (read_row % (Cta_tile::N == 16 ? 2 : (Cta_tile::N == 32 ? + // 4 : (Cta_tile::N == 128 ? 16 : 8)))); + + // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("read_row = %d, read_col = %d\n", read_row, read_col); + // } + // if ((blockIdx.x == 0) && (blockIdx.y == 0) && (read_row == 0) && + // (read_col == 0)) { + // printf("threadIdx.x = %d\n", threadIdx.x); + // } + // Assemble the read pointer. + this->smem_read_ = + smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + + // Is that thread active on the last LDS? + if (HAS_INCOMPLETE_LDS) { + this->is_active_for_last_lds_ = + read_row + (LDS_PER_LOOP - 1) * ROWS_PER_LDS < Cta_tile::M; + } + } + + // Load the output fragments. + template + inline __device__ void load(uint4 (&out)[LDS_PER_LOOP]) const { +#pragma unroll + for (int ii = 0; ii < LDS_PER_LOOP; ++ii) { + // Load the elements before the reduction (split-K). + uint4 tmp[Cta_tile::WARPS_K]; +#pragma unroll + for (int jj = 0; jj < Cta_tile::WARPS_K; ++jj) { + int imm = ii * ROWS_PER_LDS * BYTES_PER_ROW + + jj * Cta_tile::N * BYTES_PER_ELEMENT; + uint32_t smem_read = this->smem_read_ + imm; + // TD [2022-06-05] Ugly fix for d=128 in the forward pass, maybe there's + // a better way. 
+ if ((Cta_tile::N == 128) && (ROWS_PER_LDS == 4) && (ii % 2 == 1)) { + smem_read ^= 8 * BYTES_PER_LDS; + } + // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("imm diff = %d\n", smem_read - this->smem_read_); + // } + if (!HAS_INCOMPLETE_LDS || + (ii < LDS_PER_LOOP - 1 || this->is_active_for_last_lds_)) { + // fmha::lds(tmp[jj], this->smem_read_ + imm); + fmha::lds(tmp[jj], smem_read); + } + } + + // Perform the reduction. + out[ii] = zero_init ? tmp[0] : fmha::fadd4(out[ii], tmp[0]); +// if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// printf("out reduction: out = %.6f\n", reinterpret_cast(out[ii])[0]); +// } +#pragma unroll + for (int jj = 1; jj < Cta_tile::WARPS_K; ++jj) { + out[ii] = fmha::fadd4(out[ii], tmp[jj]); + // if ((threadIdx.x == 8) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("out reduction tmp = %.6f, out = %.6f\n", + // reinterpret_cast(tmp[jj])[0], + // reinterpret_cast(out[ii])[0]); + // } + } + } + } + + // Store the accumulators. + template + inline __device__ void store(const Accumulator (&acc)[M][N], int mi) { + // uint32_t smem_write_og = this->smem_write_; + static constexpr int M_PER_MMA = Mma_tile::M_PER_MMA_PER_CTA; +#pragma unroll + for (int ni = 0; ni < Mma_tile::MMAS_N; ++ni) { + // The number of MMAs that are stored per loop iteration. + static constexpr int MMAS_M_PER_LOOP = Mma_tile::MMAS_M / LOOPS; + +// Store 1st column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + uint2 tmp0, tmp1; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(0); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(1); + + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(2); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(3); + + // Store. + fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("smem_write diff = %d\n", this->smem_write_ - + // smem_write_og); + // } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // uint4 read_tmp; + // fmha::lds(read_tmp, this->smem_read_); + // printf("smem_o = %.6f\n", reinterpret_cast(read_tmp)[0]); + // } + // Swizzle the write pointer using a XOR of 16B. + this->smem_write_ ^= 32; + +// Store 2nd column of the different MMAs. +#pragma unroll + for (int mj = 0; mj < MMAS_M_PER_LOOP; ++mj) { + // Precompute the immediates to jump between rows. + int row_0 = (mj * M_PER_MMA + 0) * BYTES_PER_ROW; + int row_1 = (mj * M_PER_MMA + 8) * BYTES_PER_ROW; + + uint2 tmp0, tmp1; + tmp0.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(4); + tmp0.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(5); + + tmp1.x = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(6); + tmp1.y = acc[mi * MMAS_M_PER_LOOP + mj][ni].reg(7); + // Store. + fmha::sts(this->smem_write_ + row_0, tmp0); + fmha::sts(this->smem_write_ + row_1, tmp1); + } + + // if ((threadIdx.x == 16) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("smem_write diff = %d\n", this->smem_write_ - + // smem_write_og); + // } + + // Cancel the previous XOR of 1 + swizzle the write pointer using a XOR of + // 32B or 64B. 
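+      // Sketch of the walk (assuming Mma_tile::MMAS_N == 4, purely as an
+      // example): the combination of the XOR applied between the two column
+      // stores and the XORs below (3 * 32 after even ni, 7 * 32 after odd ni)
+      // steps the write pointer through the eight slots w, w ^ 32, ...,
+      // w ^ 224 and returns it to w after ni = 3.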
+ static_assert(Mma_tile::MMAS_N <= 8, "Not implemented"); + if (Mma_tile::MMAS_N >= 8 && ni % 4 == 3) { + this->smem_write_ ^= 15 * 32; + } else if (Mma_tile::MMAS_N >= 4 && ni % 2 == 1) { + this->smem_write_ ^= 7 * 32; + } else if (Mma_tile::MMAS_N >= 2) { + this->smem_write_ ^= 3 * 32; + } else { + this->smem_write_ ^= 3 * 32; + } + // this->smem_write_ ^= (ni & 1) ? 7 * 32 : 3 * 32; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // uint4 read_tmp; + // fmha::lds(read_tmp, this->smem_read_); + // printf("smem_o = %.6f\n", reinterpret_cast(read_tmp)[0]); + // } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_mma { + using Mma_tile = fmha::Hmma_tile; + using Fragment = fmha::Fragment_a; + + enum { COLS = Cta_tile::N }; + enum { BYTES_PER_ELT = 2 }; + enum { BYTES_PER_STS = 4 }; + enum { BYTES_PER_ROW = COLS * BYTES_PER_ELT }; // TODO + enum { BYTES_PER_TILE = Cta_tile::M * BYTES_PER_ROW }; + + enum { WARPS_M = Cta_tile::WARPS_M }; + enum { WARPS_N = Cta_tile::WARPS_N }; + enum { WARPS_K = Cta_tile::WARPS_K }; + + static_assert(WARPS_K == 1); + inline __device__ Smem_tile_mma(char* smem, int tidx) { + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + int write_col, write_row; + static_assert( + WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) || + (WARPS_M == 4 || WARPS_M == 8) || WARPS_N == 1); + if (WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) / 4 + (tidx & 0x03); + write_col ^= (write_row & 0x07) * 4; + } else { + write_row = (tidx & 0xe0) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x03); + // write_col ^= (write_row & (BYTES_PER_ROW == 32 ? 0x01 : (BYTES_PER_ROW + // == 64 ? 0x03 : (BYTES_PER_ROW == 128 ? 0x07 : 0x0f)))) * 4; + write_col ^= (write_row & + (BYTES_PER_ROW == 32 + ? 0x01 + : (BYTES_PER_ROW == 64 + ? 0x03 + : (BYTES_PER_ROW == 128 ? 
0x07 : 0x07)))) * + 4; + } + + // write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + } + + template + inline __device__ void store(const uint4 (®s)[M][N]) { + static_assert(COLS == Cta_tile::N); +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // size_t offset = write_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + + // ni * WARPS_N * 16 * BYTES_PER_ELT; fmha::sts(smem_ + offset + 0 * + // BYTES_PER_ROW, regs[mi][ni].x); fmha::sts(smem_ + offset + 8 * + // BYTES_PER_ROW, regs[mi][ni].z); offset ^= 4 * BYTES_PER_STS; + // fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y); + // fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w); + // size_t offset = smem_write_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni + // * WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = smem_write_ + mi * WARPS_M * 16 * BYTES_PER_ROW + + ni * WARPS_N * 16 * BYTES_PER_ELT; + fmha::sts(offset + 0 * BYTES_PER_ROW, regs[mi][ni].x); + fmha::sts(offset + 8 * BYTES_PER_ROW, regs[mi][ni].z); + offset ^= 4 * BYTES_PER_STS; + fmha::sts(offset + 0 * BYTES_PER_ROW, regs[mi][ni].y); + fmha::sts(offset + 8 * BYTES_PER_ROW, regs[mi][ni].w); + } + } + } + + template + inline __device__ void store(const Fragment (&frag)[N][M]) { + static_assert(COLS == Cta_tile::N); + uint4 regs[M][N]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // Need to transpose ref(1) and reg(2) here since when we load it we + // transpose again. + regs[mi][ni] = make_uint4( + frag[ni][mi].reg(0), + frag[ni][mi].reg(2), + frag[ni][mi].reg(1), + frag[ni][mi].reg(3)); + } + } + this->store(regs); + } + + // uint32_t smem_; + // uint32_t write_offset_; + uint32_t smem_write_; +}; + +template > +struct Smem_tile_mma_transposed : public Base { + enum { BYTES_PER_LDS = 16 }; + enum { BYTES_PER_ROW = Base::BYTES_PER_ROW }; + enum { BYTES_PER_ELT = Base::BYTES_PER_ELT }; + enum { WARPS_M = Base::WARPS_M }; + enum { WARPS_N = Base::WARPS_N }; + static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)); + using Fragment = typename Base::Fragment; + inline __device__ Smem_tile_mma_transposed(char* smem, int tidx) + : Base(smem, tidx) { + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)); + int read_row, read_col; + read_row = (tidx & 0x0f); + read_col = (tidx & 0xe0) / 16 + (tidx & 0x1c) / 16; + + // read_col ^= (read_row & (Base::BYTES_PER_ROW == 32 ? 0x01 : + // (Base::BYTES_PER_ROW == 64 ? 0x03 : (Base::BYTES_PER_ROW == 128 ? 
0x07 : + // 0x0f)))); + read_col ^= (read_row & 0x07); + // read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + template + inline __device__ void load(Fragment (&frag)[M][N]) { + static_assert(Base::COLS == Cta_tile::N); + for (int mi = 0; mi < M; mi++) { + for (int ni = 0; ni < N; ni++) { + // size_t offset = read_offset_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni + // * WARPS_N * 16 * BYTES_PER_ELT; + uint4 dst; + // fmha::ldsmt(dst, this->smem_ + offset); + // size_t offset = smem_read_ + mi * WARPS_M * 16 * BYTES_PER_ROW + ni * + // WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = smem_read_ + mi * WARPS_M * 16 * BYTES_PER_ROW + + ni * WARPS_N * 16 * BYTES_PER_ELT; + fmha::ldsmt(dst, offset); + frag[mi][ni].reg(0) = dst.x; + frag[mi][ni].reg(1) = dst.z; // Fragment A regs col major! + frag[mi][ni].reg(2) = dst.y; + frag[mi][ni].reg(3) = dst.w; + } + } + } + + // uint32_t read_offset_; + uint32_t smem_read_; +}; + +template > +struct Smem_tile_mma_epilogue : public Base { + enum { BYTES_PER_LDS = 16 }; + enum { BYTES_PER_ROW = Base::BYTES_PER_ROW }; + enum { BYTES_PER_ELT = Base::BYTES_PER_ELT }; + enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDS }; + static_assert(THREADS_PER_ROW * BYTES_PER_LDS == BYTES_PER_ROW); + enum { ROWS_PER_LDS = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW }; + enum { NUM_LDS = Cta_tile::M / ROWS_PER_LDS }; + static_assert(NUM_LDS * ROWS_PER_LDS == Cta_tile::M); + enum { WARPS_M = Base::WARPS_M }; + enum { WARPS_N = Base::WARPS_N }; + static_assert((WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1); + + using Acc = fmha::Fragment_accumulator; + + inline __device__ Smem_tile_mma_epilogue(char* smem, int tidx) + : Base(smem, tidx) { + uint32_t smem_ = __nvvm_get_smem_pointer(smem); + const int read_row = tidx / THREADS_PER_ROW; + int read_col = tidx % THREADS_PER_ROW; + // read_col ^= (read_row & (Base::BYTES_PER_ROW == 32 ? 0x01 : + // (Base::BYTES_PER_ROW == 64 ? 0x03 : 0x07))); + static_assert( + Base::BYTES_PER_ROW == 32 || Base::BYTES_PER_ROW == 64 || + Base::BYTES_PER_ROW == 128 || Base::BYTES_PER_ROW == 256); + read_col ^= + (read_row & + (Base::BYTES_PER_ROW == 32 + ? 0x01 + : (Base::BYTES_PER_ROW == 64 + ? 0x03 + : (Base::BYTES_PER_ROW == 128 ? 0x07 : 0x07)))); + // read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + inline __device__ void load(uint4 (&data)[NUM_LDS]) { + for (int ii = 0; ii < NUM_LDS; ii++) { + // size_t offset = read_offset_ + ii * ROWS_PER_LDS * BYTES_PER_ROW; + // fmha::lds(data[ii], this->smem_ + offset); + // size_t offset = smem_read_ + ii * ROWS_PER_LDS * BYTES_PER_ROW; + uint32_t offset = smem_read_ + ii * ROWS_PER_LDS * BYTES_PER_ROW; + fmha::lds(data[ii], offset); + } + } + + template + inline __device__ void store(const Acc (&acc)[M][N]) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // 1st row - 4 elements per row. + float tmp00 = acc[mi][ni].elt(0); + float tmp01 = acc[mi][ni].elt(1); + float tmp02 = acc[mi][ni].elt(4); + float tmp03 = acc[mi][ni].elt(5); + // 2nd row - 4 elements per row. 
+ float tmp10 = acc[mi][ni].elt(2); + float tmp11 = acc[mi][ni].elt(3); + float tmp12 = acc[mi][ni].elt(6); + float tmp13 = acc[mi][ni].elt(7); + + uint32_t x = fmha::float2_to_half2(tmp00, tmp01); + uint32_t y = fmha::float2_to_half2(tmp02, tmp03); + uint32_t z = fmha::float2_to_half2(tmp10, tmp11); + uint32_t w = fmha::float2_to_half2(tmp12, tmp13); + + // size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 + // * BYTES_PER_ROW; fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, + // x); fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, z); offset ^= + // 4 * Base::BYTES_PER_STS; fmha::sts(this->smem_ + offset + 0 * + // BYTES_PER_ROW, y); fmha::sts(this->smem_ + offset + 8 * + // BYTES_PER_ROW, w); size_t offset = (this->smem_write_ ^ (ni * 32)) + + // mi * WARPS_M * 16 * BYTES_PER_ROW; + uint32_t offset = + (this->smem_write_ ^ (ni * 32)) + mi * WARPS_M * 16 * BYTES_PER_ROW; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("mi = %d, ni = %d, offset - smem_write_ = %d\n", mi, ni, + // offset - this->smem_write_); + // } + fmha::sts(offset + 0 * BYTES_PER_ROW, x); + fmha::sts(offset + 8 * BYTES_PER_ROW, z); + offset ^= 4 * Base::BYTES_PER_STS; + fmha::sts(offset + 0 * BYTES_PER_ROW, y); + fmha::sts(offset + 8 * BYTES_PER_ROW, w); + } + } + } + + template + inline __device__ void store(const uint4 (®s)[M][N]) { + for (int mi = 0; mi < M; mi++) { + for (int ni = 0; ni < N; ni++) { + // size_t offset = (this->write_offset_ ^ (ni * 32)) + mi * WARPS_M * 16 + // * BYTES_PER_ROW; + uint32_t offset = (this->write_offset_ ^ (ni * 32)) + + mi * WARPS_M * 16 * BYTES_PER_ROW; + fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].x); + fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].z); + offset ^= 4 * Base::BYTES_PER_STS; + fmha::sts(this->smem_ + offset + 0 * BYTES_PER_ROW, regs[mi][ni].y); + fmha::sts(this->smem_ + offset + 8 * BYTES_PER_ROW, regs[mi][ni].w); + } + } + } + + // uint32_t read_offset_; + uint32_t smem_read_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_transpose { + using Mma_tile = fmha::Hmma_tile; + using Fragment_write = fmha::Fragment_b; + using Fragment_read = fmha::Fragment_b; + + enum { COLS = Cta_tile::N }; + enum { BYTES_PER_ELT = 2 }; + enum { BYTES_PER_STS = 4 }; + enum { BYTES_PER_ROW = COLS * BYTES_PER_ELT }; // TODO + enum { BYTES_PER_TILE = Cta_tile::M * BYTES_PER_ROW }; + + enum { BYTES_PER_LDS = 16 }; + + enum { WARPS_M = Cta_tile::WARPS_M }; + enum { WARPS_N = Cta_tile::WARPS_N }; + enum { WARPS_K = Cta_tile::WARPS_K }; + + static_assert(WARPS_K == 1); + static_assert(WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)); + + inline __device__ Smem_tile_transpose(char* smem, int tidx) { + smem_ = __nvvm_get_smem_pointer(smem); + // uint32_t smem_ = __nvvm_get_smem_pointer(smem); + + int write_col, write_row; + static_assert( + WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8) || + (WARPS_M == 4 || WARPS_N == 8) || WARPS_N == 1); + if (WARPS_M == 1 && (WARPS_N == 4 || WARPS_N == 8)) { + write_row = (tidx & 0x1c) / 4; + write_col = (tidx & 0xe0) / 4 + (tidx & 0x03); + } else { + write_row = (tidx & 0xe0) / 2 + (tidx & 0x1c) / 4; + write_col = (tidx & 0x03); + } + write_col ^= (write_row & 0x07) * 4; + + write_offset_ = write_row * BYTES_PER_ROW + write_col * BYTES_PER_STS; + // smem_write_ = smem_ + write_row * BYTES_PER_ROW + write_col * + // BYTES_PER_STS; + + int read_row, read_col; + read_row = 
(tidx & 0x0f); + read_col = (tidx & 0xe0) / 16 + (tidx & 0x1c) / 16; + + read_col ^= (read_row & 0x07); + read_offset_ = read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + // smem_read_ = smem_ + read_row * BYTES_PER_ROW + read_col * BYTES_PER_LDS; + } + + template + inline __device__ void store(const Fragment_write (&frag_w)[M][N], int mi) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // size_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(0)); + fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(2)); + offset ^= 4 * BYTES_PER_STS; + fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(1)); + fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(3)); + } + } + + template + inline __device__ void load(Fragment_read (&frag_r)[N]) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint4 dst; + fmha::ldsmt(dst, this->smem_ + offset); + frag_r[ni].reg(0) = dst.x; + frag_r[ni].reg(1) = dst.y; // Fragment B regs col major! + frag_r[ni].reg(2) = dst.z; + frag_r[ni].reg(3) = dst.w; + } + } + + template + inline __device__ void transpose( + const Fragment_write (&frag_w)[M][N], + Fragment_read (&frag_r)[M], + int mi) { + static_assert(COLS == Cta_tile::N); +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // size_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = write_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(0)); + fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(2)); + offset ^= 4 * BYTES_PER_STS; + fmha::sts(smem_ + offset + 0 * BYTES_PER_ROW, frag_w[ni][mi].reg(1)); + fmha::sts(smem_ + offset + 8 * BYTES_PER_ROW, frag_w[ni][mi].reg(3)); + } +#pragma unroll + for (int ni = 0; ni < N; ni++) { + // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + // size_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint32_t offset = read_offset_ + ni * WARPS_N * 16 * BYTES_PER_ELT; + uint4 dst; + fmha::ldsmt(dst, this->smem_ + offset); + frag_r[ni].reg(0) = dst.x; + frag_r[ni].reg(1) = dst.y; // Fragment B regs col major! + frag_r[ni].reg(2) = dst.z; + frag_r[ni].reg(3) = dst.w; + } + } + + uint32_t smem_; + uint32_t write_offset_; + uint32_t read_offset_; + // uint32_t smem_write_; + // uint32_t smem_read_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Gmem_tile, + // The number of buffers. (Used in multistage and double buffer cases.) + int BUFFERS_PER_TILE_ = 1> +struct Smem_tile_dp_sum { + using Cta_tile = typename Gmem_tile::Cta_tile; + using Mma_tile = fmha::Hmma_tile; + + // The size of each element. + static constexpr int BYTES_PER_ELEMENT = 4; + static constexpr int ROWS = Gmem_tile::ROWS; + static constexpr int THREADS_PER_ROW = Gmem_tile::THREADS_PER_ROW; + static constexpr int MMAS_M = Mma_tile::MMAS_M; + + static constexpr int ROWS_PER_LDG = Gmem_tile::ROWS_PER_LDG; + static constexpr int LDGS = Gmem_tile::LDGS; + + static constexpr int ROWS_PER_MMA = Mma_tile::M_PER_MMA; + + // The size of one buffer in bytes in shared memory. 
+ static constexpr int BYTES_PER_BUFFER = ROWS * BYTES_PER_ELEMENT; + // The number of buffers. + static constexpr int BUFFERS_PER_TILE = BUFFERS_PER_TILE_; + // The size in bytes of total buffers. + static constexpr int BYTES_PER_TILE = BYTES_PER_BUFFER * BUFFERS_PER_TILE; + // The boundary for smem_read_offset and smem_write_offset increment. + static constexpr int ROWS_PER_TILE_INC_BOUNDARY = + ROWS * BUFFERS_PER_TILE - ROWS; + + inline __device__ Smem_tile_dp_sum(float* smem, const int tidx) + : smem_(smem), + smem_read_buffer_(smem), + smem_write_buffer_(smem), + tidx_(tidx) {} + + // Move the read offset to next buffer. + inline __device__ void move_to_next_read_buffer() { + if (BUFFERS_PER_TILE > 1 && + (smem_read_buffer_ - smem_) >= ROWS_PER_TILE_INC_BOUNDARY) { + this->smem_read_buffer_ -= ROWS_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_read_buffer_ += ROWS; + } + } + + // Move the write offset to next buffer. + inline __device__ void move_to_next_write_buffer() { + if (BUFFERS_PER_TILE > 1 && + (smem_write_buffer_ - smem_) >= ROWS_PER_TILE_INC_BOUNDARY) { + this->smem_write_buffer_ -= ROWS_PER_TILE_INC_BOUNDARY; + } else if (BUFFERS_PER_TILE > 1) { + this->smem_write_buffer_ += ROWS; + } + } + + inline __device__ void store(const float (&sum)[LDGS]) { + if (tidx_ % THREADS_PER_ROW == 0) { + int row = tidx_ / THREADS_PER_ROW; +#pragma unroll + for (int i = 0; i < LDGS; ++i) { + if (row + i * ROWS_PER_LDG < ROWS) { + smem_write_buffer_[row + i * ROWS_PER_LDG] = sum[i]; + } + } + } + } + + inline __device__ void store(const float sum, const int buffer_idx) { + float* smem_write = smem_ + buffer_idx * ROWS; + int row = tidx_ / THREADS_PER_ROW; + if ((row < ROWS) && (tidx_ % THREADS_PER_ROW == 0)) { + smem_write[row] = sum; + } + } + + inline __device__ void store(const float (&sum)[LDGS], const int buffer_idx) { + float* smem_write = smem_ + buffer_idx * ROWS; + if (tidx_ % THREADS_PER_ROW == 0) { + int row = tidx_ / THREADS_PER_ROW; +#pragma unroll + for (int i = 0; i < LDGS; ++i) { + if (row + i * ROWS_PER_LDG < ROWS) { + smem_write[row + i * ROWS_PER_LDG] = sum[i]; + } + } + } + } + + inline __device__ void store_pair( + const float (&sum)[MMAS_M * 2], + const int buffer_idx) { + float* smem_write = smem_ + buffer_idx * ROWS; + // Extract the position in the warp. 
+ int warp = tidx_ / Cta_tile::THREADS_PER_WARP; + int lane = tidx_ % Cta_tile::THREADS_PER_WARP; + int row = lane / 4; +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { + smem_write[mi * ROWS_PER_MMA + row + 0] = sum[mi * 2 + 0]; + smem_write[mi * ROWS_PER_MMA + row + 8] = sum[mi * 2 + 1]; + } + } + + template + inline __device__ void load(float (&sum)[N], const int (&row)[N]) { +#pragma unroll + for (int ni = 0; ni < N; ni++) { + sum[ni] = smem_read_buffer_[row[ni]]; + } + } + + template + inline __device__ void load( + float (&sum)[N], + const int (&row)[N], + const int buffer_idx) { + float* smem_read = smem_ + buffer_idx * ROWS; +#pragma unroll + for (int ni = 0; ni < N; ni++) { + sum[ni] = smem_read[row[ni]]; + } + } + + static inline __device__ float reduce_warp(float sum) { + fmha::SumOp sum_op; + return fmha::Allreduce::run(sum, sum_op); + } + + const int tidx_; + float* const smem_; + float* smem_read_buffer_; + float* smem_write_buffer_; +}; + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h new file mode 100644 index 000000000..02e82c427 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/softmax.h @@ -0,0 +1,708 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Sum_ { + static constexpr bool IS_SUM = true; + static inline __device__ float apply(float x, float y) { + return x + y; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Max_ { + static constexpr bool IS_SUM = false; + static inline __device__ float apply(float x, float y) { + return x > y ? x : y; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float apply_exp_(float x, float max) { + return __expf(x - max); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float apply_exp2_(float x, float max) { + return exp2f(x - max); + // With fast-math, this produces the same PTX instruction as the assembly + // below float diff = x - max; float res; asm ("ex2.approx.ftz.f32 %0, + // %1;\n\t" : "=f"(res) : "f"(diff)); return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ReadType {}; +template <> +struct ReadType<4> { + using T = float; +}; +template <> +struct ReadType<8> { + using T = float2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Smem_tile_reduce { + // Helper class to distribute MMA tiles reduced over rows per warp over quads. + + // The Mma tile. + using Mma_tile = fmha::Hmma_tile; + + // The number of MMAs in M/N dimensions. + static constexpr int MMAS_M = Mma_tile::MMAS_M; + static constexpr int MMAS_N = Mma_tile::MMAS_N; + + static constexpr int WARPS_M = Cta_tile::WARPS_M; + static constexpr int WARPS_N = Cta_tile::WARPS_N; + + static constexpr int ROWS = WARPS_M * MMAS_M * 16; + static constexpr int COLS = WARPS_N; + static_assert(COLS == 4 || COLS == 8); + static constexpr int ROWS_PER_XOR_PATTERN = (COLS == 8) ? 4 : 8; + static constexpr int BYTES_PER_TILE = ROWS * COLS * sizeof(float); + static constexpr int ELTS_PER_TILE = ROWS * COLS; + + static constexpr int THREADS_PER_GROUP = + Kernel_traits::Gmem_tile_o::THREADS_PER_ROW; + // TD [2022-05-02]: No longer true if head_dim != 64 + // static_assert(THREADS_PER_GROUP == 16); // DEBUG + static constexpr int ROWS_PER_WARP = 32 / THREADS_PER_GROUP; + static constexpr int LOOPS = Kernel_traits::Gmem_tile_o::LOOPS; + static_assert(LOOPS == 1); + + using read_t = typename ReadType::T; + + __device__ inline Smem_tile_reduce(float* smem_, const int tidx) { + int lane = tidx % 32; + int warp = tidx / 32; + + int warp_m = warp % WARPS_M; + int warp_n = warp / WARPS_M; + + qid_ = lane % 4; + int qp = lane / 4; + + // Swizzle the column to avoid 2-fold bank conflicts when we have 8 warps. + // This won't affect reading as we assume commutative reduction ops. 
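+    // Worked example (illustrative): with WARPS_N == 8 a row holds 8 floats,
+    // so without the XOR the 8 writers of a warp (qp = 0..7, all sharing the
+    // same warp_n) hit banks (qp * 8 + warp_n) % 32, and qp and qp + 4 collide.
+    // XOR-ing the column with qp / ROWS_PER_XOR_PATTERN flips the low bit of
+    // the column for qp >= 4, so the two halves land in different banks.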
+ const int col = warp_n ^ (qp / ROWS_PER_XOR_PATTERN); + smem_write_ = &smem_[warp_m * 16 * MMAS_M * WARPS_N + qp * WARPS_N + col]; + smem_read_ = &reinterpret_cast( + smem_)[warp_m * 16 * MMAS_M * 4 + qp * 4 + qid_]; + smem_read_row_ = + &reinterpret_cast(smem_)[warp_m * 16 * MMAS_M * 4 + qid_]; + } + + __device__ inline void store(float (&frag)[2 * MMAS_M]) { + if (qid_ == 0) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + int offset = mi * 16 * WARPS_N; + smem_write_[offset + 0 * 8 * WARPS_N] = frag[mi * 2 + 0]; + smem_write_[offset + 1 * 8 * WARPS_N] = frag[mi * 2 + 1]; + } + } + } + + __device__ inline void load(read_t (&frag)[2 * MMAS_M]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + int offset = mi * 16 * 4; + frag[mi * 2 + 0] = smem_read_[offset + 0 * 8 * 4]; + frag[mi * 2 + 1] = smem_read_[offset + 1 * 8 * 4]; + } + } + + __device__ inline void load_row(read_t (&frag)[MMAS_M], int row) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + int offset = mi * 16 * 4; + frag[mi] = smem_read_row_[offset + 0 * 8 * 4 + row * 4]; + } + } + + int qid_; + float* smem_write_; + read_t* smem_read_; + read_t* smem_read_row_; +}; + +template +struct Softmax_base { + // The Mma tile. + using Mma_tile = fmha::Hmma_tile; + + // The number of MMAs in M/N dimensions. + static constexpr int MMAS_M = Mma_tile::MMAS_M; + static constexpr int MMAS_N = Mma_tile::MMAS_N; + + // The number of groups of warp such that we have at most 4 warps writing + // consecutive elements. + static constexpr int GROUPS = fmha::DivUpConstexpr(Cta_tile::WARPS_N, 4); + // The number of elements that we are going to store per row. + static constexpr int ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS; + // The number of rows. + static constexpr int ROWS = Cta_tile::M * GROUPS; + // The total number of elements. + static constexpr int ELEMENTS = ROWS * ELEMENTS_PER_ROW; + + // Ctor. + template + inline __device__ Softmax_base(const Params& params, void* smem, int tidx) + : // packed_mask_ptr_(reinterpret_cast(params.packed_mask_ptr)), + smem_(reinterpret_cast(smem)), + tidx_(tidx) { + // Move to the 1st mask loaded by the thread+ tidx; + // packed_mask_ptr_ += bidb * params.packed_mask_stride_in_bytes + tidx * + // sizeof(uint32_t); + + // Extract the position in the warp. + int warp = tidx / Cta_tile::THREADS_PER_WARP; + int lane = tidx % Cta_tile::THREADS_PER_WARP; + + // Decompose the warp index into M and N. + int warp_m = warp % Cta_tile::WARPS_M; + int warp_n = warp / Cta_tile::WARPS_M; + + // Decompose the warp-n index into group/position-inside-the-group. + int warp_g = warp_n / ELEMENTS_PER_ROW; + int warp_i = warp_n % ELEMENTS_PER_ROW; + + // The location written by the threads. + int write_row = + warp_g * (ROWS / GROUPS) + warp_m * Mma_tile::M_PER_MMA + lane / 4; + int write_col = warp_i; + + // Assemble the write pointer. + smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col]; + + // Assemble the read pointer. + smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + lane / 4]; + } + + template + inline __device__ void apply_mask(const Mask& mask) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + if (!mask.is_valid(mi, ni, ii, jj)) { + elt_[2 * mi + ii][4 * ni + jj] = zero ? 0.f : -INFINITY; + } + } + } + } + } + } + + // Apply the exp to all the elements. 
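+  // Note: the rewrite used below relies on the identity
+  //   exp(x - max) == exp2(x * log2(e) - max * log2(e)),
+  // so each element reduces to one FFMA (x * log2e + (-max * log2e)) feeding a
+  // single ex2.approx, rather than a subtract followed by a separate exp.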
+ template + inline __device__ void apply_exp(const float (&max)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + constexpr float kLog2e = M_LOG2E; + const float max_base2 = max_in_base2 ? max[mi] : max[mi] * kLog2e; +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + // elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]); + elt_[mi][ni] = apply_exp2_( + elt_in_base2 ? elt_[mi][ni] : elt_[mi][ni] * kLog2e, max_base2); + } + } + } + + // Apply the exp to all the elements. + template + inline __device__ void scale_apply_exp( + const float (&max)[MMAS_M * 2], + const float scale_) { + const float max_scale = scale_max ? scale_ * M_LOG2E : M_LOG2E; + const float scale = scale_ * M_LOG2E; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + const float max_scaled = max[mi] * max_scale; +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] = apply_exp2_(elt_[mi][ni] * scale, max_scaled); + } + } + } + + // Apply the exp to all the elements. + template + inline __device__ void apply_exp_col(const float (&max)[MMAS_N * 4]) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + constexpr float kLog2e = M_LOG2E; + const float max_base2 = max_in_base2 ? max[ni] : max[ni] * kLog2e; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + elt_[mi][ni] = apply_exp2_(elt_[mi][ni] * kLog2e, max_base2); + } + } + } + // inline __device__ void apply_exp_col(const float (&max)[MMAS_N]) { + // constexpr float kLog2e = M_LOG2E; + // #pragma unroll + // for( int ni = 0; ni < MMAS_N * 4; ++ni ) { + // float max_base2 = max_in_base2 ? max[ni / 4] : max[ni / 4] * + // kLog2e; max_base2 = __shfl_sync(0xffffffff, max_base2, (ni % 4) * 8 + // + threadIdx.x % 8); #pragma unroll for( int mi = 0; mi < MMAS_M * + // 2; ++mi ) { + // elt_[mi][ni] = apply_exp2_(elt_[mi][ni] * kLog2e, max_base2); + // } + // } + // } + + template + inline __device__ void apply_dropout(Philox& ph, uint32_t p_dropout_in_uint) { + // We encode the dropout pattern in the sign bit of the non-negative + // softmax to distinguish from pre-existing zeros + auto encode_dropout = [](bool keep, float val) { + return keep ? val : (encode_dropout_in_sign_bit ? 
-val : float(0)); + }; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; mi++) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ni++) { + uint4 tmp = ph(); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("ni = %d, ph Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, + // tmp.z, tmp.w); + // } + elt_[mi][4 * ni + 0] = + encode_dropout(tmp.x <= p_dropout_in_uint, elt_[mi][4 * ni + 0]); + elt_[mi][4 * ni + 1] = + encode_dropout(tmp.y <= p_dropout_in_uint, elt_[mi][4 * ni + 1]); + elt_[mi][4 * ni + 2] = + encode_dropout(tmp.z <= p_dropout_in_uint, elt_[mi][4 * ni + 2]); + elt_[mi][4 * ni + 3] = + encode_dropout(tmp.w <= p_dropout_in_uint, elt_[mi][4 * ni + 3]); + } + } + } + + template + inline __device__ void apply_dropout( + Philox& ph0, + Philox& ph1, + uint32_t p_dropout_in_uint) { + // We encode the dropout pattern in the sign bit of the non-negative + // softmax to distinguish from pre-existing zeros + auto encode_dropout = [](bool keep, float val) { + return keep ? val : (encode_dropout_in_sign_bit ? -val : float(0)); + }; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; mi++) { + static_assert(MMAS_N % 2 == 0); +#pragma unroll + for (int ni = 0; ni < MMAS_N; ni += 2) { + uint4 tmp = ph0(); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("ni = %d, ph0, Philox: %u, %u, %u, %u\n", ni, tmp.x, + // tmp.y, tmp.z, tmp.w); + // } + elt_[mi][4 * ni + 0] = + encode_dropout(tmp.x <= p_dropout_in_uint, elt_[mi][4 * ni + 0]); + elt_[mi][4 * ni + 1] = + encode_dropout(tmp.y <= p_dropout_in_uint, elt_[mi][4 * ni + 1]); + elt_[mi][4 * ni + 2] = + encode_dropout(tmp.z <= p_dropout_in_uint, elt_[mi][4 * ni + 2]); + elt_[mi][4 * ni + 3] = + encode_dropout(tmp.w <= p_dropout_in_uint, elt_[mi][4 * ni + 3]); + tmp = ph1(); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("ni = %d, ph1, Philox: %u, %u, %u, %u\n", ni + 1, tmp.x, + // tmp.y, tmp.z, tmp.w); + // } + elt_[mi][4 * (ni + 1) + 0] = encode_dropout( + tmp.x <= p_dropout_in_uint, elt_[mi][4 * (ni + 1) + 0]); + elt_[mi][4 * (ni + 1) + 1] = encode_dropout( + tmp.y <= p_dropout_in_uint, elt_[mi][4 * (ni + 1) + 1]); + elt_[mi][4 * (ni + 1) + 2] = encode_dropout( + tmp.z <= p_dropout_in_uint, elt_[mi][4 * (ni + 1) + 2]); + elt_[mi][4 * (ni + 1) + 3] = encode_dropout( + tmp.w <= p_dropout_in_uint, elt_[mi][4 * (ni + 1) + 3]); + } + } + } + + template + inline __device__ void apply_dropout_16bits( + Philox& ph, + uint16_t p_dropout_in_uint16_t) { + // We encode the dropout pattern in the sign bit of the non-negative + // softmax to distinguish from pre-existing zeros + auto encode_dropout = [](bool keep, float val) { + return keep ? val : (encode_dropout_in_sign_bit ? 
-val : float(0)); + }; +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ni++) { + uint16_t tmp[8]; + fmha::uint4_to_ushort8(ph(), tmp); +// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// printf("ni = %d, ph Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, tmp.z, +// tmp.w); +// } +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + elt_[mi * 2 + ii][4 * ni + jj] = encode_dropout( + tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, + elt_[mi * 2 + ii][4 * ni + jj]); + } + } + } + } + } + + template + inline __device__ void apply_dropout_16bits( + Philox& ph0, + Philox& ph1, + uint16_t p_dropout_in_uint16_t) { + // We encode the dropout pattern in the sign bit of the non-negative + // softmax to distinguish from pre-existing zeros + auto encode_dropout = [](bool keep, float val) { + return keep ? val : (encode_dropout_in_sign_bit ? -val : float(0)); + }; +#pragma unroll + for (int mi = 0; mi < MMAS_M; mi++) { + static_assert(MMAS_N % 2 == 0); +#pragma unroll + for (int ni = 0; ni < MMAS_N; ni += 2) { + uint16_t tmp[8]; + fmha::uint4_to_ushort8(ph0(), tmp); +// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// printf("ni = %d, ph Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, tmp.z, +// tmp.w); +// } +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + elt_[mi * 2 + ii][4 * ni + jj] = encode_dropout( + tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, + elt_[mi * 2 + ii][4 * ni + jj]); + } + } + fmha::uint4_to_ushort8(ph1(), tmp); +// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// printf("ni = %d, ph Philox: %u, %u, %u, %u\n", ni, tmp.x, tmp.y, tmp.z, +// tmp.w); +// } +#pragma unroll + for (int ii = 0; ii < 2; ++ii) { +#pragma unroll + for (int jj = 0; jj < 4; ++jj) { + elt_[mi * 2 + ii][4 * (ni + 1) + jj] = encode_dropout( + tmp[ii * 4 + jj] <= p_dropout_in_uint16_t, + elt_[mi * 2 + ii][4 * (ni + 1) + jj]); + } + } + } + } + } + + // Scale all the elements. + inline __device__ void scale(const float (&sum)[MMAS_M * 2]) { + // Precompute the inverse sum to normalize. Without -use_fast_math, it makes + // a huge deal. + float inv_sum[MMAS_M * 2]; +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { + inv_sum[mi] = + (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; + } + +// Update the values. +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] *= inv_sum[mi]; + } + } + } + + // Subtract all elements by dp_sum + inline __device__ void subtract_dp_sum(const float (&dp_sum)[MMAS_M * 2]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M * 2; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N * 4; ++ni) { + elt_[mi][ni] -= dp_sum[mi]; + } + } + } + + // The pointer to the mask. + const char* packed_mask_ptr_; + // Shared memory for the CTA-wide reduction. + float *smem_, *smem_write_, *smem_read_; + // The current thread index. + int tidx_; + // The elements. + float elt_[MMAS_M * 2][MMAS_N * 4]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax : public Softmax_base { + // The base class. + using Base = Softmax_base; + // The fragment. 
+ using Fragment_a = fmha::Fragment_a; + + static_assert(Fragment_a::NUM_REGS == 4); + + static constexpr int WARPS_M = Cta_tile::WARPS_M; + static constexpr int WARPS_N = Cta_tile::WARPS_N; + // The MMAs. + static constexpr int MMAS_M = Base::MMAS_M; + static constexpr int MMAS_N = Base::MMAS_N; + + // The accumulators. + using Accumulator = fmha::Fragment_accumulator; + using Accumulator_out = Fragment; + static_assert(Accumulator_out::NUM_REGS == 4); + + static_assert(std::is_same::value); + + using Smem_tile_red = Smem_tile_reduce; + static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N); + // Ctor. + template + inline __device__ Softmax(const Params& params, void* smem, int tidx) + : Base(params, smem, tidx), + params_scale_bmm1_(params.scale_bmm1), + smem_sum_(static_cast(smem), tidx), + smem_max_( + static_cast(smem) + Smem_tile_red::ELTS_PER_TILE, + tidx) {} + + // Pack the data to a fragment for the next GEMM. + template + inline __device__ void pack(Fragment_a (&dst)[K][M]) const { +#pragma unroll + for (int mi = 0; mi < M; ++mi) { +#pragma unroll + for (int ki = 0; ki < K; ++ki) { + // 1st row - 4 elements per row. + float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; + float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; + float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; + float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; + + // 2nd row - 4 elements per row. + float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; + float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; + float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; + float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; + + // Pack to 4 registers. + dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01); + dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11); + dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03); + dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13); + } + } + } + + // Scale FP32 fragments + inline __device__ void unpack(const Accumulator (&acc)[MMAS_M][MMAS_N]) { + const float scalef = + reinterpret_cast(this->params_scale_bmm1_); + +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // 1st row - 4 elements per row. + this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scalef; + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scalef; + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scalef; + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scalef; + // 2nd row - 4 elements per row. + this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scalef; + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scalef; + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scalef; + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scalef; + } + } + } + + // Scale FP32 fragments + inline __device__ void unpack_noscale( + const Accumulator (&acc)[MMAS_M][MMAS_N]) { +#pragma unroll + for (int mi = 0; mi < MMAS_M; ++mi) { +#pragma unroll + for (int ni = 0; ni < MMAS_N; ++ni) { + // 1st row - 4 elements per row. + this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0); + this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1); + this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4); + this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5); + // 2nd row - 4 elements per row. 
+ this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2); + this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3); + this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6); + this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7); + } + } + } + + template + __device__ inline void thread_reduce_( + float (&frag)[2 * MMAS_M], + Operator& op) { +#pragma unroll + for (int mi = 0; mi < 2 * MMAS_M; mi++) { + frag[mi] = + zero_init ? this->elt_[mi][0] : op(frag[mi], this->elt_[mi][0]); +#pragma unroll + for (int ni = 1; ni < 4 * MMAS_N; ni++) { + frag[mi] = op(frag[mi], this->elt_[mi][ni]); + } + } + } + + template + __device__ inline void reduce_( + float (&frag)[2 * MMAS_M], + Operator& op, + Smem_tile_red& smem_red) { + thread_reduce_(frag, op); + quad_reduce(frag, frag, op); + smem_red.store(frag); + __syncthreads(); + typename Smem_tile_red::read_t tmp[2 * MMAS_M]; + smem_red.load(tmp); + quad_allreduce(frag, tmp, op); + } + + template + __device__ inline void reduce_max(float (&frag)[2 * MMAS_M]) { + MaxOp max; + reduce_(frag, max, smem_max_); + } + + __device__ inline void reduce_sum(float (&frag)[2 * MMAS_M]) { + SumOp sum; + reduce_(frag, sum, smem_sum_); + } + + template + __device__ inline void reduce_sum_before_sync_(float (&frag)[2 * MMAS_M]) { + SumOp sum; + thread_reduce_(frag, sum); + quad_reduce(frag, frag, sum); + smem_sum_.store(frag); + } + + template + __device__ inline void reduce_after_sync_( + float (&frag)[NROWS][MMAS_M], + const int (&rows)[NROWS], + Operator& op, + Smem_tile_red& smem_red) { +#pragma unroll + for (int ii = 0; ii < NROWS; ii++) { + typename Smem_tile_red::read_t tmp[MMAS_M]; + smem_red.load_row(tmp, rows[ii]); + quad_allreduce(frag[ii], tmp, op); + } + } + + template + __device__ inline void reduce_sum_after_sync_( + float (&frag)[NROWS][MMAS_M], + const int (&rows)[NROWS]) { + SumOp sum; + reduce_after_sync_(frag, rows, sum, smem_sum_); + } + + template + __device__ inline void reduce_max_after_sync_( + float (&frag)[NROWS][MMAS_M], + const int (&rows)[NROWS]) { + MaxOp max; + reduce_after_sync_(frag, rows, max, smem_max_); + } + + const uint32_t params_scale_bmm1_; + Smem_tile_red smem_max_; + Smem_tile_red smem_sum_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha/utils.h b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h new file mode 100644 index 000000000..7bc0b3df9 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha/utils.h @@ -0,0 +1,1332 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void* ptr); + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Row {}; +struct Col {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Next_power_of_two {}; + +template +struct Next_power_of_two { + enum { VALUE = M }; +}; +template <> +struct Next_power_of_two<3, false> { + enum { VALUE = 4 }; +}; +template <> +struct Next_power_of_two<5, false> { + enum { VALUE = 8 }; +}; +template <> +struct Next_power_of_two<6, false> { + enum { VALUE = 8 }; +}; +template <> +struct Next_power_of_two<7, false> { + enum { VALUE = 8 }; +}; +template <> +struct Next_power_of_two<9, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<10, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<11, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<12, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<13, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<14, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<15, false> { + enum { VALUE = 16 }; +}; +template <> +struct Next_power_of_two<24, false> { + enum { VALUE = 32 }; +}; +template <> +struct Next_power_of_two<48, false> { + enum { VALUE = 64 }; +}; +template <> +struct Next_power_of_two<80, false> { + enum { VALUE = 128 }; +}; +template <> +struct Next_power_of_two<96, false> { + enum { VALUE = 128 }; +}; +template <> +struct Next_power_of_two<112, false> { + enum { VALUE = 128 }; +}; +template <> +struct Next_power_of_two<144, false> { + enum { VALUE = 256 }; +}; + 
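+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Illustrative compile-time checks of the helper above (a sketch for
+// exposition only, not part of the upstream fmha sources); they compile away.
+static_assert(Next_power_of_two<64, true>::VALUE == 64, "");
+static_assert(Next_power_of_two<80, false>::VALUE == 128, "");
+static_assert(Next_power_of_two<144, false>::VALUE == 256, "");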
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Prev_power_of_two {}; + +template +struct Prev_power_of_two { + enum { VALUE = N }; +}; +template <> +struct Prev_power_of_two<3, false> { + enum { VALUE = 2 }; +}; +template <> +struct Prev_power_of_two<5, false> { + enum { VALUE = 4 }; +}; +template <> +struct Prev_power_of_two<6, false> { + enum { VALUE = 4 }; +}; +template <> +struct Prev_power_of_two<7, false> { + enum { VALUE = 4 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Div_up { + enum { VALUE = (M + N - 1) / N }; +}; + +constexpr int DivUpConstexpr(int M, int N) { + return (M + N - 1) / N; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Max { + enum { VALUE = A >= B ? A : B }; +}; + +constexpr int MaxConstexpr(int A, int B) { + return A >= B ? A : B; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Max_3 { + enum { VALUE = Max::VALUE, C>::VALUE }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Min { + enum { VALUE = A <= B ? A : B }; +}; + +constexpr int MinConstexpr(int A, int B) { + return A <= B ? A : B; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Uint_from_size_in_bytes {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<1> { + using Type = uint8_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<2> { + using Type = uint16_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<4> { + using Type = uint32_t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<8> { + using Type = uint2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Uint_from_size_in_bytes<16> { + using Type = uint4; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Warp_masks {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Warp_masks<8, 1, 1> { + enum { M = 0xe0, N = 0x00, K = 0x00 }; +}; +template <> +struct Warp_masks<4, 2, 1> { + enum { M = 0x60, N = 0x80, K = 0x00 }; +}; +template <> +struct Warp_masks<4, 1, 2> { + enum { M = 0x60, N = 0x00, K = 0x80 }; +}; +template <> +struct Warp_masks<4, 1, 1> { + enum { M = 0x60, N = 0x00, K = 0x00 }; +}; +template <> +struct Warp_masks<2, 4, 1> { + enum { M = 0x20, N = 0xc0, K = 0x00 }; +}; +template <> +struct Warp_masks<2, 2, 2> { + enum { M = 0x20, N = 0x40, K = 0x80 }; +}; +template <> +struct Warp_masks<2, 2, 1> { + enum { M = 0x20, N = 0x40, K = 0x00 }; +}; +template <> +struct Warp_masks<2, 1, 2> { + enum { M = 0x20, N = 0x00, K = 0x40 }; +}; +template <> +struct Warp_masks<2, 1, 1> { + enum { M = 0x20, N = 0x00, K = 0x00 }; +}; +template <> 
+struct Warp_masks<1, 8, 1> { + enum { M = 0x00, N = 0xe0, K = 0x00 }; +}; +template <> +struct Warp_masks<1, 4, 2> { + enum { M = 0x00, N = 0x60, K = 0x80 }; +}; +template <> +struct Warp_masks<1, 4, 1> { + enum { M = 0x00, N = 0x60, K = 0x00 }; +}; +template <> +struct Warp_masks<1, 2, 2> { + enum { M = 0x00, N = 0x20, K = 0x40 }; +}; +template <> +struct Warp_masks<1, 2, 1> { + enum { M = 0x00, N = 0x20, K = 0x00 }; +}; +template <> +struct Warp_masks<1, 1, 4> { + enum { M = 0x00, N = 0x00, K = 0x60 }; +}; +template <> +struct Warp_masks<1, 1, 2> { + enum { M = 0x00, N = 0x00, K = 0x20 }; +}; +template <> +struct Warp_masks<1, 1, 1> { + enum { M = 0x00, N = 0x00, K = 0x00 }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ __host__ T div_up(T m, T n) { + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int clz(int x) { + for (int i = 31; i >= 0; --i) { + if ((1 << i) & x) { + return 31 - i; + } + } + return 32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int find_log_2(int x, bool round_up = false) { + int a = 31 - clz(x); + if (round_up) { + a += (x & (x - 1)) ? 1 : 0; + } + return a; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hadd2(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hmin2(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("min.f16x2 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hmul2(const uint32_t a, const uint32_t b) { + // uint32_t c; + // asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + // return c; + __half2 result = __hmul2( + reinterpret_cast(a), + reinterpret_cast(b)); + return reinterpret_cast(result); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hmul4(uint2 a, uint2 b) { + uint2 c; + c.x = hmul2(a.x, b.x); + c.y = hmul2(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hmul8(uint4 a, uint4 b) { + uint4 c; + c.x = hmul2(a.x, b.x); + c.y = hmul2(a.y, b.y); + c.z = hmul2(a.z, b.z); + c.w = hmul2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hmul8(uint32_t a, uint4 b) { + uint4 c; + c.x = hmul2(a, b.x); + c.y = hmul2(a, b.y); + c.z = hmul2(a, b.z); + c.w = hmul2(a, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hrelu2(uint32_t x, uint32_t lb = 0) { + uint32_t res; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("max.f16x2 %0, %1, %2;\n" : "=r"(res) : "r"(x), "r"(lb)); +#else + const uint32_t zero = 0u; + asm volatile( + "{\n" + "\t .reg .f16x2 sela;\n" + "\t set.gtu.u32.f16x2 
sela, %1, %2;\n" + "\t and.b32 %0, sela, %1;\n" + "}\n" + : "=r"(res) + : "r"(x), "r"(zero)); +#endif + return res; +} +static inline __device__ uint32_t habs2(uint32_t x) { + uint32_t res; + asm volatile("abs.f16x2 %0, %1;\n" : "=r"(res) : "r"(x)); + return res; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +template +static inline __device__ T clamp(T x, T lb, T ub) { + return x < lb ? lb : (x > ub ? ub : x); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t clamp_to_zero(uint16_t x) { + uint16_t mask; + asm volatile("set.gtu %0, %1, 0;" : "=h"(mask) : "h"(x)); + return mask & x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t float_to_half(float f) { + uint16_t h; + asm volatile("cvt.rn.f16.f32 %0, %1;" : "=h"(h) : "f"(f)); + return h; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float2_to_half2(float a, float b) { + uint32_t c; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(c) : "f"(b), "f"(a)); +#else + uint16_t lo = float_to_half(a); + uint16_t hi = float_to_half(b); + asm volatile("mov.b32 %0, {%1, %2};\n" : "=r"(c) : "h"(lo), "h"(hi)); +#endif + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float_to_half2(float a) { + return float2_to_half2(a, a); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t float2_to_half2(const float2& f) { + return float2_to_half2(f.x, f.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 +float4_to_half4(float x, float y, float z, float w) { + uint2 d; + d.x = float2_to_half2(x, y); + d.y = float2_to_half2(z, w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hfma2(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t +hfma2_relu(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("fma.rn.f16x2.relu %0, %1, %2, %3;" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); +#else + d = hrelu2(hfma2(a, b, c)); +#endif + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t h0_h0(uint32_t x) { + uint32_t y; + asm volatile( + "{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {lo, lo};}\n" + : "=r"(y) + : "r"(x)); + return y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float h0_to_float(uint32_t h2) { + float f; + asm volatile( + "{\n" + ".reg .f16 lo, hi;\n" + "mov.b32 {lo, hi}, %1;\n" + "cvt.f32.f16 %0, lo;\n" + "}\n" + : "=f"(f) + : "r"(h2)); + return f; +} + 
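+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Illustrative sketch (hypothetical helper for exposition, not part of the
+// upstream fmha sources): exercises the packed-fp16 helpers above.
+// float2_to_half2(a, b) packs a into the low lane and b into the high lane,
+// hfma2 is a lane-wise f16x2 fused multiply-add, and h0_to_float reads back
+// the low lane.
+static inline __device__ float fp16x2_helpers_demo() {
+  uint32_t ab = float2_to_half2(2.f, 3.f);   // lanes {2, 3}
+  uint32_t cd = float2_to_half2(10.f, 20.f); // lanes {10, 20}
+  uint32_t ef = float2_to_half2(1.f, 4.f);   // lanes {1, 4}
+  uint32_t r = hfma2(ab, cd, ef);            // lanes {2*10+1, 3*20+4} = {21, 64}
+  return h0_to_float(r);                     // low lane -> 21.f
+}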
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t h1_h1(uint32_t x) { + uint32_t y; + asm volatile( + "{.reg .f16 lo, hi; mov.b32 {lo, hi}, %1; mov.b32 %0, {hi, hi};}\n" + : "=r"(y) + : "r"(x)); + return y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hadd(uint16_t a, uint16_t b) { + uint16_t d; + asm volatile("add.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hadd(uint32_t a, uint32_t b) { + return hadd2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hadd4(uint2 a, uint2 b) { + uint2 c; + c.x = hadd2(a.x, b.x); + c.y = hadd2(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint2 hadd(uint2 a, uint2 b) { + return hadd4(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hadd8(uint4 a, uint4 b) { + uint4 c; + c.x = hadd2(a.x, b.x); + c.y = hadd2(a.y, b.y); + c.z = hadd2(a.z, b.z); + c.w = hadd2(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Converted two half2's into float, then take their dot product. +// inline __device__ void hfma2_to_float(float &sum, const __half2 a, const +// __half2 b) { +static inline __device__ float hfma2_to_float( + const __half2 a, + const __half2 b) { + float2 af = __half22float2(a); + float2 bf = __half22float2(b); + return af.x * bf.x + af.y * bf.y; + // sum += af.x * bf.x + af.y * bf.y; + // sum = __fmaf_rn(sum, af.x, bf.x); + // sum = __fmaf_rn(sum, af.y, bf.y); + // float2 prod = __half22float2(__hmul2(a, b)); + // sum += prod.x + prod.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Converted two vectors of 8 half's into float, then take their dot product. 
+static inline __device__ float hmulsum8(const uint4 a, const uint4 b) { + float sum; + sum = fmha::hfma2_to_float( + reinterpret_cast(a.x), + reinterpret_cast(b.x)); + sum += fmha::hfma2_to_float( + reinterpret_cast(a.y), + reinterpret_cast(b.y)); + sum += fmha::hfma2_to_float( + reinterpret_cast(a.z), + reinterpret_cast(b.z)); + sum += fmha::hfma2_to_float( + reinterpret_cast(a.w), + reinterpret_cast(b.w)); + return sum; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 fadd4(uint4 a, uint4 b) { + float4 c; + c.x = + reinterpret_cast(a.x) + reinterpret_cast(b.x); + c.y = + reinterpret_cast(a.y) + reinterpret_cast(b.y); + c.z = + reinterpret_cast(a.z) + reinterpret_cast(b.z); + c.w = + reinterpret_cast(a.w) + reinterpret_cast(b.w); + return reinterpret_cast(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 fmul4(uint4 a, float b) { + float4 c; + c.x = reinterpret_cast(a.x) * b; + c.y = reinterpret_cast(a.y) * b; + c.z = reinterpret_cast(a.z) * b; + c.w = reinterpret_cast(a.w) * b; + return reinterpret_cast(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint4 hadd(uint4 a, uint4 b) { + return hadd8(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float half_to_float(uint16_t h) { + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float2 half2_to_float2(uint32_t x) { + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(x)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void half2_to_float2(float& x, float& y, uint32_t h) { + float2 tmp = half2_to_float2(h); + x = tmp.x; + y = tmp.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hfma(uint16_t a, uint16_t b, uint16_t c) { + uint16_t d; + asm volatile("fma.rn.f16 %0, %1, %2, %3;" : "=h"(d) : "h"(a), "h"(b), "h"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint16_t hmul(uint16_t a, uint16_t b) { + uint16_t d; + asm volatile("mul.f16 %0, %1, %2;" : "=h"(d) : "h"(a), "h"(b)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void uint4_to_ushort8( + const uint4 a, + uint16_t (&b)[8]) { + uint32_t* b_tmp = reinterpret_cast(&b[0]); + b_tmp[0] = a.x; + b_tmp[1] = a.y; + b_tmp[2] = a.z; + b_tmp[3] = a.w; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint16_t& dst) { + dst = uint16_t(0); +} + 
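+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Illustrative sketch (hypothetical helper for exposition, not part of the
+// upstream fmha sources): hmulsum8 above treats each uint4 as 8 packed fp16
+// values and returns their dot product accumulated in fp32.
+static inline __device__ float hmulsum8_demo() {
+  // a = (1, 1, 1, 1, 1, 1, 1, 1), b = (1, 2, 3, 4, 5, 6, 7, 8)
+  uint4 a = make_uint4(float2_to_half2(1.f, 1.f), float2_to_half2(1.f, 1.f),
+                       float2_to_half2(1.f, 1.f), float2_to_half2(1.f, 1.f));
+  uint4 b = make_uint4(float2_to_half2(1.f, 2.f), float2_to_half2(3.f, 4.f),
+                       float2_to_half2(5.f, 6.f), float2_to_half2(7.f, 8.f));
+  return hmulsum8(a, b); // 1 + 2 + ... + 8 == 36.f
+}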
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint32_t& dst) { + dst = 0u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint2& dst) { + dst = make_uint2(0u, 0u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void clear(uint4& dst) { + dst = make_uint4(0u, 0u, 0u, 0u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// P R E D I C A T E P A C K I N G +// +//////////////////////////////////////////////////////////////////////////////////////////////////// +enum { + BYTES_PER_REG = 4, + PREDS_PER_BYTE = 4, + PREDS_PER_REG = BYTES_PER_REG * PREDS_PER_BYTE +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// G E N E R I C P R E D I C A T E D L D G S T S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void load_(Functor& fct, const uint32_t (&preds)[M]) { + // The number of complete bytes (where we use all the predicates in a byte). + enum { COMPLETE = N / PREDS_PER_BYTE }; + // Make sure we did allocate enough predicates. + static_assert(Div_up::VALUE <= M, ""); + // The remainder. + enum { REMAINDER = N - COMPLETE * PREDS_PER_BYTE }; + // Make sure we got the math right and the remainder is between 0 and 3. + static_assert(REMAINDER >= 0 && REMAINDER <= 3, ""); + // The mask to extract the predicates. + enum { COMPLETE_MASK = (1 << PREDS_PER_BYTE) - 1 }; + +// Clear the fetch registers. +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + fct.clear(ii); + } + + // Run complete steps. + bool p[PREDS_PER_BYTE]; +#pragma unroll + for (int ii = 0; ii < COMPLETE; ++ii) { + // The predicate. + uint32_t reg = preds[ii / BYTES_PER_REG]; + +// Extract the predicates. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + uint32_t mask = 1u << (ii % BYTES_PER_REG * 8 + jj); + p[jj] = (reg & mask) != 0u; + } + +// Issue the loads. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + fct.load(ii * PREDS_PER_BYTE + jj, p[jj]); + } + } + + // Skip the rest of the code if we do not have a remainder. + if (REMAINDER > 0) { + // The mask to extract the predicates. + enum { REMAINDER_MASK = (1 << REMAINDER) - 1 }; + + // The predicate register. + uint32_t reg = preds[COMPLETE / BYTES_PER_REG]; + +// Extract the predicates. +#pragma unroll + for (int jj = 0; jj < PREDS_PER_BYTE; ++jj) { + uint32_t mask = 1u << (COMPLETE % BYTES_PER_REG * 8 + jj); + p[jj] = (reg & mask) != 0u; + } + +// Issue the loads. 
+#pragma unroll + for (int ii = 0; ii < REMAINDER; ++ii) { + fct.load(COMPLETE * PREDS_PER_BYTE + ii, p[ii]); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void load_(Functor& fct, uint32_t preds) { + uint32_t tmp[1] = {preds}; + load_(fct, tmp); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// L D G +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldg(uint8_t& dst, const void* ptr) { + dst = *reinterpret_cast(ptr); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldg(uint16_t& dst, const void* ptr) { + dst = *reinterpret_cast(ptr); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldg(uint32_t& dst, const void* ptr) { + dst = *reinterpret_cast(ptr); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldg(uint2& dst, const void* ptr) { + dst = *reinterpret_cast(ptr); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldg(uint4& dst, const void* ptr) { + dst = *reinterpret_cast(ptr); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Ldg_functor { + // Ctor. + inline __device__ Ldg_functor(Data_type (&fetch)[N], const void* (&ptrs)[N]) + : fetch_(fetch), ptrs_(ptrs) {} + + // Clear the element. + inline __device__ void clear(int ii) { + fmha::clear(fetch_[ii]); + } + + // Trigger the loads. + inline __device__ void load(int ii, bool p) { + if (p) { + ldg(fetch_[ii], ptrs_[ii]); + } + } + + // The fetch registers. + Data_type (&fetch_)[N]; + // The pointers. 
+ const void* (&ptrs_)[N]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg_( + Data_type (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + Ldg_functor fct(fetch, ptrs); + load_(fct, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg( + uint8_t (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg( + uint16_t (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg( + uint32_t (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg( + uint2 (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void ldg( + uint4 (&fetch)[N], + const void* (&ptrs)[N], + uint32_t (&preds)[M]) { + ldg_(fetch, ptrs, preds); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// L D S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint16_t& dst, uint32_t ptr) { + asm volatile("ld.shared.b16 %0, [%1];\n" : "=h"(dst) : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint32_t& dst, uint32_t ptr) { + asm volatile("ld.shared.b32 %0, [%1];\n" : "=r"(dst) : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint2& dst, uint32_t ptr) { + asm volatile("ld.shared.v2.b32 {%0, %1}, [%2];\n" + : "=r"(dst.x), "=r"(dst.y) + : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void lds(uint4& dst, uint32_t ptr) { + asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// L D S M +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint32_t& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint32_t& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n" + : "=r"(dst) + : "r"(ptr)); +#endif +} + 
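// Illustrative sketch (not from the original source): how callers are expected
// to drive the predicated-load path above. Predicates are packed four per byte
// and sixteen per 32-bit register, matching the bit layout that load_()
// extracts (bit position = byte_in_reg * 8 + pred_in_byte). pack_predicates is
// a hypothetical helper written here only to document that layout.
template <int N, int M>
static inline __device__ void pack_predicates(
    const bool (&p)[N],
    uint32_t (&preds)[M]) {
  static_assert(N <= M * PREDS_PER_REG, "not enough predicate registers");
#pragma unroll
  for (int r = 0; r < M; ++r) {
    preds[r] = 0u;
  }
#pragma unroll
  for (int i = 0; i < N; ++i) {
    int byte_in_reg = (i / PREDS_PER_BYTE) % BYTES_PER_REG;
    int pred_in_byte = i % PREDS_PER_BYTE;
    if (p[i]) {
      preds[i / PREDS_PER_REG] |= 1u << (byte_in_reg * 8 + pred_in_byte);
    }
  }
}
// Usage sketch: guard four 16-byte global loads with the low predicate bits.
//   uint4 fetch[4];
//   const void* ptrs[4] = {...};   // one pointer per fetch register
//   bool in_bounds[4] = {...};     // e.g. row < actual_seqlen
//   uint32_t preds[1];
//   pack_predicates(in_bounds, preds);
//   fmha::ldg(fetch, ptrs, preds); // out-of-bounds slots are cleared to zero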
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint2& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst.x), "=r"(dst.y) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint2& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile( + "ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n" + : "=r"(dst.x), "=r"(dst.y) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsm(uint4& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile( + "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void ldsmt(uint4& dst, uint32_t ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 730 + asm volatile( + "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(dst.x), "=r"(dst.y), "=r"(dst.z), "=r"(dst.w) + : "r"(ptr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// S T G +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint8_t val) { + *reinterpret_cast(ptr) = val; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint16_t val) { + *reinterpret_cast(ptr) = val; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint32_t val) { + *reinterpret_cast(ptr) = val; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint2 val) { + *reinterpret_cast(ptr) = val; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void stg(void* ptr, uint4 val) { + *reinterpret_cast(ptr) = val; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// S T S +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint16_t val) { + asm volatile("st.shared.b16 [%0], %1;\n" : : "r"(ptr), "h"(val)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint32_t val) { + asm volatile("st.shared.b32 [%0], %1;\n" : : "r"(ptr), "r"(val)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void sts(uint32_t ptr, uint2 val) { + asm volatile("st.shared.v2.b32 [%0], {%1, %2};\n" + : + : "r"(ptr), "r"(val.x), "r"(val.y)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void 
sts(uint32_t ptr, uint4 val) { + asm volatile("st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n" + : + : "r"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts_(uint32_t (&ptrs)[N], const Data_type (&data)[N]) { +#pragma unroll + for (int ii = 0; ii < N; ++ii) { + sts(ptrs[ii], data[ii]); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], const uint16_t (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], const uint32_t (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], const uint2 (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void sts(uint32_t (&ptrs)[N], const uint4 (&data)[N]) { + sts_(ptrs, data); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MaxOp { + __device__ inline T operator()(T const& x, T const& y) { + return x > y ? x : y; + } +}; + +template <> +struct MaxOp { + // This is slightly faster + __device__ inline float operator()(float const& x, float const& y) { + return max(x, y); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SumOp { + __device__ inline T operator()(T const& x, T const& y) { + return x + y; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); + template + static __device__ inline T run(T x, Operator& op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +struct Allreduce<2> { + template + static __device__ inline T run(T x, Operator& op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_reduce( + float (&dst)[M], + float (&src)[M], + Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 2)); + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 1)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_reduce( + __half2 (&dst)[M], + __half2 (&src)[M], + Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 2)); + dst[mi] = op(dst[mi], __shfl_down_sync(uint32_t(-1), dst[mi], 1)); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
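// Illustrative usage (not part of the original source) of the shuffle-based
// reduction helpers above. Allreduce<4> performs a butterfly over lanes that
// differ in their low two bits (XOR offsets 2, then 1), so every thread in a
// group of four adjacent lanes ends up holding the reduced value; quad_reduce
// instead funnels the result down to the lowest lane of each quad.
// quad_max_example is a hypothetical name.
static inline __device__ float quad_max_example(float x) {
  MaxOp<float> max_op;
  // After this call, lanes {4k, 4k+1, 4k+2, 4k+3} all hold the same maximum.
  return Allreduce<4>::run(x, max_op);
}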
+template +__device__ inline void quad_reduce( + float (&dst)[M], + float2 (&src)[M], + Operator& op) { + float tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = op(src[mi].x, src[mi].y); + } + quad_reduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_reduce( + __half2 (&dst)[M], + float2 (&src)[M], + Operator& op) { + __half2 tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = + op(reinterpret_cast(src[mi].x), + reinterpret_cast(src[mi].y)); + } + quad_reduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce( + float (&dst)[M], + float (&src)[M], + Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = Allreduce<4>::run(dst[mi], op); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce( + __half2 (&dst)[M], + __half2 (&src)[M], + Operator& op) { +#pragma unroll + for (int mi = 0; mi < M; mi++) { + dst[mi] = src[mi]; + dst[mi] = Allreduce<4>::run(dst[mi], op); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce( + float (&dst)[M], + float2 (&src)[M], + Operator& op) { + float tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = op(src[mi].x, src[mi].y); + } + quad_allreduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ inline void quad_allreduce( + __half2 (&dst)[M], + float2 (&src)[M], + Operator& op) { + __half2 tmp[M]; +#pragma unroll + for (int mi = 0; mi < M; mi++) { + tmp[mi] = + op(reinterpret_cast(src[mi].x), + reinterpret_cast(src[mi].y)); + } + quad_allreduce(dst, tmp, op); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu new file mode 100644 index 000000000..46bddc48e --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_fp16_kernel.sm80.cu @@ -0,0 +1,155 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include "fmha.h" +#include "fmha_block_fprop_kernel_1xN.h" + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax> +__global__ void fmha_block_fprop_fp16_sm80_loop_kernel( + Fused_multihead_attention_fprop_params params) { + fmha::device_block_1xN_loop< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax>(params); +} + +template +void run_fmha_block_fp16_sm80_loop_( + Launch_params& launch_params, + const bool configure) { + bool is_causal = launch_params.params.is_causal; + // TD [2022-04-27]: This case work is pretty ugly, maybe there's a better way? + auto kernel = launch_params.is_dropout + ? (is_causal ? (launch_params.return_softmax + ? &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + true, + true> + : &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + true, + false>) + : (launch_params.return_softmax + ? &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + false, + true> + : &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + false, + false>)) + : (is_causal ? (launch_params.return_softmax + ? &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + true, + true> + : &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + true, + false>) + : (launch_params.return_softmax + ? &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + false, + true> + : &fmha_block_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + false, + false>)); + + constexpr int N = Kernel_traits::Cta_tile_p::N; + const int loop_steps = (launch_params.params.s + N - 1) / N; + constexpr int smem_size_softmax_lse = + Kernel_traits::Smem_dp_sum::BYTES_PER_TILE; + // Don't need smem_size_softmax_lse if we're not looping + const int smem_size = fmha::get_dynamic_smem_size() + + (loop_steps > 1 ? 
smem_size_softmax_lse : 0); + + if (smem_size >= 48 * 1024) { + FMHA_CHECK_CUDA(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + if (configure) { + using Mma_tile_p = fmha::Hmma_tile; + constexpr int M = Kernel_traits::Cta_tile_p::M; + size_t STEPS = (launch_params.params.s + M - 1) / M; + constexpr size_t MMAS_M = Mma_tile_p::MMAS_M; + constexpr size_t MMAS_N = Mma_tile_p::MMAS_N; + size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps; + launch_params.elts_per_thread = elts_per_head; + return; + } + + dim3 grid(launch_params.params.h, launch_params.params.b); + kernel<<>>( + launch_params.params); + + FMHA_CHECK_CUDA(cudaPeekAtLastError()); +} + +void run_fmha_block_fp16_sm80( + Launch_params& launch_params, + const bool configure) { + if (launch_params.params.d == 16) { + using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 4, 0x08u>; + run_fmha_block_fp16_sm80_loop_(launch_params, configure); + } else if (launch_params.params.d == 32) { + using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u>; + run_fmha_block_fp16_sm80_loop_(launch_params, configure); + } else if (launch_params.params.d == 64) { + using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>; + run_fmha_block_fp16_sm80_loop_(launch_params, configure); + } +} diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h new file mode 100644 index 000000000..89776414a --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h @@ -0,0 +1,661 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/*************************************************************************************************** + * Copyright (c) 2022, Tri Dao. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include "fmha_blockmask.h" +#include "fmha_fprop_kernel_1xN.h" +#include "fmha_kernel.h" + +namespace fmha { + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax, + bool Is_first, + bool Is_last, + typename Params, + typename Prng> +inline __device__ void device_block_1xN_( + const Params& params, + const int bidb, + const int bidh, + int steps, + Prng& ph0, + Prng& ph1, + const int loop_step_idx) { + // The description of the CTA tile for the 1st batched GEMM. + using Cta_tile_p = typename Kernel_traits::Cta_tile_p; + // The description of the CTA tile for the 2nd batched GEMM. + using Cta_tile_o = typename Kernel_traits::Cta_tile_o; + + // The MMA tile for the 1st GEMM. + using Mma_tile_p = fmha::Hmma_tile; + // The MMA tile for the 2nd GEMM. + using Mma_tile_o = fmha::Hmma_tile; + + // The global memory tile to load Q. + using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q; + + // The global memory tile to load K. + using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k; + + // The global memory tile to load V. + using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v; + // The shared memory tile to swizzle V. + using Smem_tile_v = typename Kernel_traits::Smem_tile_v; + + // The global memory tile to store O. + using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o; + using Gmem_tile_o_tmp = fmha::Gmem_tile_o; + // The shared memory tile to swizzle O. + using Smem_tile_o = typename Kernel_traits::Smem_tile_o; + + using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s; + + using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum; + + using Smem_softmax_sum = typename Kernel_traits::Smem_dp_sum; + + using Gemm1 = Gemm_Q_K; + + using Softmax = fmha::Softmax; + + // Shared memory. + extern __shared__ char smem_[]; + + // The thread index. + const int tidx = threadIdx.x; + + const BlockInfoPadded binfo(params, bidb, bidh, tidx); + // if( binfo.stop_early() ) return; + if (binfo.stop_early(loop_step_idx * Cta_tile_p::N)) + return; + + Blockmask blockmask(params, loop_step_idx); + int block_row_idx = 0; + int mask_val = blockmask.mask_val(0); + if (mask_val == -1) + return; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("mask_val = %d.\n", mask_val); + // } + + Gemm1 gemm_q_k(smem_, tidx); + // Allocate the global memory tile loader for Q. + Gmem_tile_q gmem_q( + params.q_ptr, + params.q_row_stride_in_elts, + params.q_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for O. + Gmem_tile_o gmem_o( + params.o_ptr, + params.o_row_stride_in_elts, + params.o_head_stride_in_elts, + binfo, + tidx); + Gmem_tile_o_tmp gmem_o_tmp( + params.o_tmp_ptr, + params.o_row_stride_in_elts, + params.o_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for S. 
+ Gmem_tile_s gmem_s(params, binfo, tidx); + Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx); + + // Wind gmem tiles to the correct position. + static_assert(Cta_tile_p::N % Cta_tile_p::M == 0); + int block_row_idx_next = mask_val / 4; + int block_row_idx_to_move = block_row_idx_next - block_row_idx; + gmem_q.move(block_row_idx_to_move); + gmem_o.move(block_row_idx_to_move); + gmem_o_tmp.move(block_row_idx_to_move); + if (Return_softmax) { + gmem_s.move(block_row_idx_to_move); + } + gmem_softmax_lse.move(block_row_idx_to_move); + block_row_idx = block_row_idx_next; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("begin = %d, steps = %d\n", begin, steps); + // } + + fmha::Mask mask(binfo, tidx, loop_step_idx); + + // Allocate the global memory tile loader for K. + Gmem_tile_k gmem_k( + params.k_ptr, + params.k_row_stride_in_elts, + params.k_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for V. + Gmem_tile_v gmem_v( + params.v_ptr, + params.v_row_stride_in_elts, + params.v_head_stride_in_elts, + binfo, + tidx); + // The base pointer of smem_v; + char* smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V]; + + // Allocate the shared memory tile loader for V. We use the same as K so be + // careful!!! + Smem_tile_v smem_v(smem_v_, tidx); + + // Allocate the shared memory tile loader for O. We use the same as K so be + // careful!!! + Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx); + + if (!Is_first) { + gmem_k.move(loop_step_idx); + gmem_v.move(loop_step_idx); + if (Return_softmax) { + gmem_s.move(loop_step_idx * steps); + } + } + + // Trigger the loads for K. + gmem_k.load(); + // Trigger the loads for Q. + gmem_q.load(); + // Trigger the loads for V. + gmem_v.load(); + + if (!Is_first) { + __syncthreads(); + } + + float p_prev_lse[Mma_tile_p::MMAS_M * 2]; + if (!(Is_first || mask_val % 2 == 1)) { + gmem_softmax_lse.load( + reinterpret_cast(p_prev_lse)); + } + + // Commit the data for Q and V to shared memory. + gmem_q.commit(gemm_q_k.smem_q); + gmem_v.commit(smem_v); + + // const uint32_t scale_bmm1 = reinterpret_cast(params.scale_bmm1); #pragma unroll for(int it=0;it < + // Gmem_tile_k::LDGS;it++){ + // gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]); + // } + + // Commit the data for K to shared memory. + if (!Kernel_traits::SHARE_SMEM_FOR_K_AND_V) { + gmem_k.commit(gemm_q_k.smem_k); + } + + __syncthreads(); + + // Load the fragments for Q. + gemm_q_k.load_q(); + + // Load the fragments for V. We keep the data in registers during the entire + // kernel. + typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N]; +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) { + smem_v.load(frag_v[ki], ki); + } + + // Commit the data for V to shared memory if it has not been done already. + if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V) { + // Make sure we are done loading the fragments for K. + __syncthreads(); + + // Commit the data to shared memory for V. + gmem_k.commit(gemm_q_k.smem_k); + + // Make sure the data is in shared memory. + __syncthreads(); + } + + // Load the fragments for K. + gemm_q_k.load_k(); + + // Create the object to do the softmax. + Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx); + + Smem_softmax_sum smem_softmax_lse( + reinterpret_cast(&smem_[Gemm1::SMEM_BYTES]), tidx); + + // Load over the entire sequence length. 
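/*
 * Illustrative note, not from the original source. The loop below implements
 * the streaming-softmax recurrence of FlashAttention for one block of keys:
 * each iteration computes a tile of S = Q * K^T, folds it into a running
 * row-wise max and sum, rescales the partial output accumulated so far, and
 * stores the running log-sum-exp (LSE) so later key blocks can keep rescaling.
 * In scalar form, the per-row update for a new chunk of scores looks like:
 *
 *   float m_new = max(m_prev, max(chunk));            // running max
 *   float scale = expf(m_prev - m_new);               // rescale old partials
 *   l = l * scale + sum_i(expf(chunk[i] - m_new));    // running sum
 *   o = o * scale + sum_i(expf(chunk[i] - m_new) * v[i]);
 *   m_prev = m_new;                                   // lse = m_new + logf(l)
 *
 * Judging by its uses in this function, mask_val from the blockmask appears to
 * encode the query block row in its upper bits (mask_val / 4), with bit 0
 * marking the first visit to that row (no previous LSE/partials to load) and
 * bit 1 marking the last visit (results can be written out directly).
 */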
+ for (int l = 0; l < steps; l++) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("block_row_idx = %d\n", block_row_idx); + // } + if (block_row_idx * Cta_tile_p::M >= binfo.actual_seqlen) + break; + + int mask_val_next = l < steps - 1 ? blockmask.mask_val(l + 1) : -1; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("mask_val = %d, mask_val_next = %d\n", mask_val, + // mask_val_next); + // } + + // Declare the accumulators for the 1st gemm. + fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N]; + fmha::Clear_accumulator< + typename fmha::Accumulator_type, + Cta_tile_p::WARPS_K>::apply(acc_p); + + // Do this part of P = Q * K^T. + gemm_q_k(acc_p); + + uint4 out[Gmem_tile_o::STGS_PER_LOOP]; + bool is_first_read = Is_first || mask_val % 2 == 1; + // if (!Is_first) { gmem_o_tmp.load(out, 0); } + if (!is_first_read) { + gmem_o_tmp.load(out, 0); + } + + // Trigger the load for the next Q values. + bool not_last_iter = (l < steps - 1) && (mask_val_next != -1); + block_row_idx_next = mask_val_next / 4; + int block_row_idx_to_move = block_row_idx_next - block_row_idx; + if (not_last_iter) { + gemm_q_k.smem_q.move_to_next_write_buffer(); + gmem_q.move(block_row_idx_to_move); + gmem_q.load(); + } + + // Load the mask for that iteration. + mask.load(block_row_idx); + + // Convert from the accumulator type to FP32 for Softmax. + softmax.unpack_noscale(acc_p); + + // Apply the mask. + softmax.apply_mask(mask); + + // softmax.unpack_noscale_half_and_apply_mask(acc_p, mask); + + if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0) { + // if we share K and V, it could be that V was not fully read yet but we + // write into smem for reduction + __syncthreads(); + } + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_prev_lse=%.6f, %.6f\n", p_prev_lse[0], p_prev_lse[1]); + // } + // } + // Compute the max. + float p_max[Mma_tile_p::MMAS_M * 2]; + // if (!Is_first) { + if (!is_first_read) { + smem_softmax_lse.store_pair(p_prev_lse, l % 2); + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = + // p_prev_lse[mi]; } + for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { + p_max[mi] = p_prev_lse[mi] / params.scale_bmm1f; + } + } + + // Trigger the load for the next LSE values. + if (not_last_iter) { + // if (!Is_first) { + if (!(Is_first || mask_val_next % 2 == 1)) { + gmem_softmax_lse.load_next( + reinterpret_cast(p_prev_lse), + block_row_idx_to_move); + } + } + + // __half2 p_max[Mma_tile_p::MMAS_M]; + // softmax.template reduce_max(p_max); + is_first_read ? softmax.template reduce_max(p_max) + : softmax.template reduce_max(p_max); + + // if ((threadIdx.x == 0) && (l == 38)) { + // printf("loop_step_idx %d, p_max = %.6f, %.6f., p_prev_lse = %.6f, + // %.6f\n", loop_step_idx, p_max[0], p_max[1], Is_first ? -10000.f : + // p_prev_lse[0], Is_first ? -10000.f : p_prev_lse[1]); + // } + + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("after reduce_max=%.6f, %.6f\n", softmax.elt_[0][0], + // softmax.elt_[0][1]); + // } + // } + + // Compute the exponential value. 
+ // softmax.apply_exp(p_max); + softmax.scale_apply_exp(p_max, params.scale_bmm1f); + + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("after apply_exp=%.6f, %.6f\n", softmax.elt_[0][0], + // softmax.elt_[0][1]); + // } + // } + + // Compute the sum. + float p_sum[Mma_tile_p::MMAS_M * 2]; + // if (!Is_first) { + // int warp = tidx / Cta_tile_p::THREADS_PER_WARP; + // int lane = tidx % Cta_tile_p::THREADS_PER_WARP; + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { + // p_sum[mi] = ((warp == 0) && (lane % 4 == 0)) ? + // expf(p_prev_lse[mi] - p_max[mi]) : 0; + // } + // } + // softmax.reduce_sum(p_sum); + softmax.reduce_sum_before_sync_(p_sum); + // softmax.template reduce_sum_before_sync_(p_sum); + + // float p_sum_log[Mma_tile_p::MMAS_M * 2]; + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; ++mi) { + // float sum = p_sum[mi]; + // // p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] + // + __logf(sum); constexpr float kLog2e = M_LOG2E; p_sum_log[mi] = (sum + // == 0.f || sum != sum) ? INFINITY : p_max[mi] * kLog2e + __log2f(sum); + // } + // // gmem_softmax_lse.store(reinterpret_cast(p_sum)); + // gmem_softmax_lse.store(reinterpret_cast(p_sum_log)); gmem_softmax_lse.move(); + + // // Finalize softmax on the accumulators of P^T. + // softmax.scale(p_sum); + + constexpr bool encode_dropout_in_sign_bit = Return_softmax; + if (Is_dropout) { + // softmax.template apply_dropout(ph0, + // params.p_dropout_in_uint); softmax.template + // apply_dropout(ph0, ph1, + // params.p_dropout_in_uint); + softmax.template apply_dropout_16bits( + ph0, ph1, params.p_dropout_in_uint16_t); + } + + using Frag_p = fmha::Fragment_a; + Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M]; + static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M); + static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N); + softmax.pack(frag_p); + if (Return_softmax) { + gmem_s.store(frag_p, mask); + if (not_last_iter) { + gmem_s.move(block_row_idx_to_move); + } + } + + // Commit the values for Q into shared memory. + if (not_last_iter) { + gmem_q.commit(gemm_q_k.smem_q); + } + + if (Is_dropout && encode_dropout_in_sign_bit) { +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ki++) { +#pragma unroll + for (int mi = 0; mi < Mma_tile_o::MMAS_M; mi++) { + frag_p[ki][mi].hrelu_(); + } + } + } + + // Declare the accumulators for the 2nd gemm. + fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N]; + fmha::Clear_accumulator< + typename fmha::Accumulator_type, + Cta_tile_o::WARPS_K>::apply(acc_o); + +// Do this part of O = P^T * V^T. +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) { + fmha::gemm_cl(acc_o, frag_p[ki], frag_v[ki]); + } + + // The mapping from tidx to rows changes between the softmax and the + // O-reduction. So we recalculate the max. 
+ float p_max_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; + // TODO: not sure if this is right for seqlen 128 or 256 + int rows[Gmem_tile_o::STGS_PER_LOOP]; + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + rows[jj] = + tidx / Gmem_tile_o::THREADS_PER_ROW + jj * Gmem_tile_o::ROWS_PER_STG; + } + softmax.reduce_max_after_sync_(p_max_o, rows); + static_assert(Mma_tile_o::MMAS_M == 1); + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + p_max_o[jj][0] *= params.scale_bmm1f; + } + float p_prev_scale_o[Gmem_tile_o::STGS_PER_LOOP]; + // if (!Is_first) { smem_softmax_lse.load(p_prev_scale_o, rows, l % 2); } + if (!is_first_read) { + smem_softmax_lse.load(p_prev_scale_o, rows, l % 2); + } + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_prev_scale_o=%.6f\n", p_prev_scale_o[0]); + // } + // } + + static_assert(Gmem_tile_o::LOOPS == 1); + + // Swizzle the elements and do the final reduction. + smem_o.store(acc_o, 0); + + // Make sure the data is in shared memory. + __syncthreads(); + + static_assert(Mma_tile_o::MMAS_M == 1); + float p_sum_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; + softmax.reduce_sum_after_sync_(p_sum_o, rows); + // if (!Is_first) { + if (!is_first_read) { + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + p_prev_scale_o[jj] = expf(p_prev_scale_o[jj] - p_max_o[jj][0]); + p_sum_o[jj][0] += p_prev_scale_o[jj]; + } + } + + float p_sum_log[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; +#pragma unroll + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + float sum = p_sum_o[jj][0]; + p_sum_log[jj][0] = + (sum == 0.f || sum != sum) ? -INFINITY : p_max_o[jj][0] + __logf(sum); + // if (sum == 0.f || sum != sum) { + // printf("loop_step_idx = %d, l = %d, tidx = %d, sum = %.6f, p_max_o + // = %.6f\n", loop_step_idx, l, tidx, sum, p_max_o[jj][0]); + // } + // if (Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_sum_log=%.6f\n", p_sum_log[jj][0]); + // } + // } + if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) && + (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) { + gmem_softmax_lse.store_row( + reinterpret_cast(p_sum_log[jj]), + rows[jj]); + } + } + if (not_last_iter) { + gmem_softmax_lse.move(block_row_idx_to_move); + } + + // Load from shared memory. + // if (!Is_first) { + if (!is_first_read) { + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + out[jj] = fmha::fmul4(out[jj], p_prev_scale_o[jj]); + } + } + // smem_o.template load(out); + is_first_read ? smem_o.template load(out) + : smem_o.template load(out); + + const bool is_final_write = Is_last || + ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen) || + ((mask_val & 0x2) != 0) || + ((Is_causal) && + (block_row_idx * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N)); +// if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// printf("is_final_write = %d\n", is_final_write); +// } +#pragma unroll + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + float sum = p_sum_o[jj][0]; + float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum; + if (Is_dropout && is_final_write) { + inv_sum *= params.rp_dropout; + } + out[jj] = fmha::fmul4(out[jj], inv_sum); + } + + // if (Is_dropout && Is_last) { + // for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + // out[jj] = fmha::fmul4(out[jj], params.rp_dropout); + // } + // } + + // Output the values. 
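/*
 * Illustrative note, not from the original source: on every visit except the
 * last, the normalized partial accumulators are spilled to o_tmp_ptr and the
 * running LSE is kept so that the next key block can rescale them (see the
 * fmul4(out[jj], p_prev_scale_o[jj]) above). Only the final visit for a query
 * block writes to the real output pointer, and the dropout correction
 * rp_dropout is applied exactly once, on that final write.
 */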
+ if (is_final_write) { + gmem_o.store(out, 0); + } else { + gmem_o_tmp.store(out, 0); + } + + // Move to the next part of the output. + gmem_o.move(block_row_idx_to_move); + if (!(Is_first && Is_last)) { + gmem_o_tmp.move(block_row_idx_to_move); + } + gemm_q_k.reload_k(); + + // Make sure we are reading from the correct buffer. + gemm_q_k.smem_q.move_to_next_read_buffer(); + // Trigger the load from shared memory for the next series of Q values. + if (not_last_iter) { + gemm_q_k.reload_q(); + } + + if (mask_val_next == -1) + break; + mask_val = mask_val_next; + block_row_idx += block_row_idx_to_move; + + } // Outer loop over the sequence length. +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax, + typename Params> +inline __device__ void device_block_1xN_loop(const Params& params) { + // The block index for the batch. + const int bidb = blockIdx.y; + // The block index for the head. + const int bidh = blockIdx.x; + // The thread index. + const int tidx = threadIdx.x; + + const int tidx_global = (bidb * params.h + bidh) * blockDim.x * 2 + tidx; + // auto seeds = at::cuda::philox::unpack(params.philox_args); + auto seeds = std::make_tuple(0, 0); + Philox ph0(std::get<0>(seeds), tidx_global, std::get<1>(seeds)); + Philox ph1(std::get<0>(seeds), tidx_global + blockDim.x, std::get<1>(seeds)); + const int STEPS = params.s / Kernel_traits::Cta_tile_p::M; + + constexpr int N_per_loop = Kernel_traits::Cta_tile_p::N; + if (params.s == N_per_loop) { + fmha::device_block_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + true, + true>(params, bidb, bidh, STEPS, ph0, ph1, 0); + } else { + const int max_loop_steps = (params.s + N_per_loop - 1) / N_per_loop; + fmha::device_block_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + true, + false>(params, bidb, bidh, STEPS, ph0, ph1, 0); + for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; + loop_step_idx++) { + fmha::device_block_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + false, + false>(params, bidb, bidh, STEPS, ph0, ph1, loop_step_idx); + } + fmha::device_block_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + false, + true>(params, bidb, bidh, STEPS, ph0, ph1, max_loop_steps - 1); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h new file mode 100644 index 000000000..9de497e7f --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_blockmask.h @@ -0,0 +1,69 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Blockmask { + template + __device__ Blockmask(const Params& params, int loop_step_idx) + : blockmask_ptr(params.blockmask + loop_step_idx * params.s / 16) {} + + __device__ int mask_val(int block_row_idx) const { + return blockmask_ptr[block_row_idx]; + } + + const int* blockmask_ptr; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu new file mode 100644 index 000000000..5031d81a0 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_fp16_kernel.sm80.cu @@ -0,0 +1,262 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +// #include "fmha.h" +// #include "fmha_fprop_kernel_1xN.h" + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax> +__global__ void fmha_fprop_fp16_sm80_loop_kernel( + Fused_multihead_attention_fprop_params params) { + fmha::device_1xN_loop( + params); +} + +template +void run_fmha_fp16_sm80_loop_( + Launch_params& launch_params, + const bool configure) { + bool is_causal = launch_params.params.is_causal; + // TD [2022-04-27]: This case work is pretty ugly, maybe there's a better way? + auto kernel = launch_params.is_dropout + ? (is_causal ? (launch_params.return_softmax + ? &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + true, + true> + : &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + true, + false>) + : (launch_params.return_softmax + ? &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + false, + true> + : &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + true, + false, + false>)) + : (is_causal ? (launch_params.return_softmax + ? &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + true, + true> + : &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + true, + false>) + : (launch_params.return_softmax + ? &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + false, + true> + : &fmha_fprop_fp16_sm80_loop_kernel< + Kernel_traits, + false, + false, + false>)); + + constexpr int N = Kernel_traits::Cta_tile_p::N; + const int loop_steps = (launch_params.params.s + N - 1) / N; + constexpr int smem_size_softmax_lse = + Kernel_traits::Smem_dp_sum::BYTES_PER_TILE; + // Don't need smem_size_softmax_lse if we're not looping + const int smem_size = fmha::get_dynamic_smem_size() + + (loop_steps > 1 ? 
smem_size_softmax_lse : 0); + + if (smem_size >= 48 * 1024) { + FMHA_CHECK_CUDA(cudaFuncSetAttribute( + kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + if (configure) { + using Mma_tile_p = fmha::Hmma_tile; + constexpr int M = Kernel_traits::Cta_tile_p::M; + size_t STEPS = (launch_params.params.s + M - 1) / M; + constexpr size_t MMAS_M = Mma_tile_p::MMAS_M; + constexpr size_t MMAS_N = Mma_tile_p::MMAS_N; + size_t elts_per_head = STEPS * MMAS_M * MMAS_N * 8 * loop_steps; + launch_params.elts_per_thread = elts_per_head; + return; + } + + dim3 grid(launch_params.params.h, launch_params.params.b); + kernel<<>>( + launch_params.params); + + FMHA_CHECK_CUDA(cudaPeekAtLastError()); +} + +void run_fmha_fp16_sm80( + Launch_params& launch_params, + const bool configure) { + if (launch_params.params.d == 16) { + if (launch_params.params.s == 128) { + using Kernel_traits = FMHA_kernel_traits<128, 16, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else if (launch_params.params.s == 256) { + using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else { + // TD [2022-05-15] 512 gives wrong results rn + // using Kernel_traits = FMHA_kernel_traits<512, 16, 16, 1, 4, 0x08u>; + using Kernel_traits = FMHA_kernel_traits<256, 16, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } + } else if (launch_params.params.d == 32) { + if (launch_params.params.s == 128) { + using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else if (launch_params.params.s == 256) { + using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else { + using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } + } else if (launch_params.params.d == 64) { + if (launch_params.params.s == 128) { + using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else if (launch_params.params.s >= 256) { + // auto dprops = at::cuda::getCurrentDeviceProperties(); + // if (dprops->major == 8 && dprops->minor >= 0) { + using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + // } else if (dprops->major == 7 && dprops->minor == 5) { + // if (launch_params.is_dropout) { // Need to use the same block size + // as backward + // using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } else { + // using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } + // } + } + } else if (launch_params.params.d == 128) { + if (launch_params.params.s == 128) { + using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, 0x08u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + } else { + // auto dprops = at::cuda::getCurrentDeviceProperties(); + // if (dprops->major == 8 && dprops->minor >= 0 && + // !launch_params.is_dropout) { + // // TD [2022-06-05] Keep K in registers to reduce register spilling + // // Gives about 6% speedup compared to using block size 128. 
+ using Kernel_traits = FMHA_kernel_traits<256, 128, 16, 1, 4, 0x18u>; + run_fmha_fp16_sm80_loop_(launch_params, configure); + // } else { // Need to use the same block size as backward + // using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } + } + } + // if (launch_params.params.d == 64) { + // // using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>; + // // using Kernel_traits = FMHA_kernel_traits<64, 64, 16, 1, 4, 0x08u>; + // // using Kernel_traits = FMHA_kernel_traits<512, 64, 16, 1, 8, 0x08u>; + // using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>; + // run_fmha_fp16_sm80_loop_(launch_params, configure); + // } + // if (launch_params.params.d == 64) { + // if( launch_params.params.s == 128 ) { + // using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, 0x08u>; + // run_fmha_fp16_sm80_loop_(launch_params, configure); + // } else if( launch_params.params.s >= 256 ) { + // auto dprops = at::cuda::getCurrentDeviceProperties(); + // if (dprops->major == 8 && dprops->minor >= 0) { + // using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } else if (dprops->major == 7 && dprops->minor == 5) { + // if (launch_params.is_dropout) { // Need to use the same block + // size as backward + // using Kernel_traits = FMHA_kernel_traits<128, 64, 16, 1, 4, + // 0x08u>; + // run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } else { + // using Kernel_traits = FMHA_kernel_traits<256, 64, 16, 1, 4, + // 0x08u>; + // run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } + // } + // } + // } + // if (launch_params.params.d == 128) { + // if( launch_params.params.s == 128 ) { + // using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } else { + // auto dprops = at::cuda::getCurrentDeviceProperties(); + // if (dprops->major == 8 && dprops->minor >= 0 && + // !launch_params.is_dropout) { + // // TD [2022-06-05] Keep K in registers to reduce register + // spilling + // // Gives about 6% speedup compared to using block size 128. + // using Kernel_traits = FMHA_kernel_traits<256, 128, 16, 1, 4, + // 0x18u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } else { // Need to use the same block size as backward + // using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 4, + // 0x08u>; run_fmha_fp16_sm80_loop_(launch_params, + // configure); + // } + // } + // } +} diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h new file mode 100644 index 000000000..1cd4c191c --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h @@ -0,0 +1,795 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/*************************************************************************************************** + * Copyright (c) 2022, Tri Dao. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include "fmha_kernel.h" + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Gemm_Q_K_base { + using Smem_tile_o = typename Kernel_traits::Smem_tile_o; + using Smem_tile_q = typename Kernel_traits::Smem_tile_q; + using Smem_tile_k = typename Kernel_traits::Smem_tile_k; + using Fragment_q = typename Smem_tile_q::Fragment; + using Fragment_k = typename Smem_tile_k::Fragment; + + // The description of the CTA tile for the 1st batched GEMM. + using Cta_tile_p = typename Kernel_traits::Cta_tile_p; + + // The MMA tile for the 1st GEMM. + using Mma_tile_p = fmha::Hmma_tile; + + static constexpr int SMEM_BYTES_SOFTMAX = + Cta_tile_p::M * Cta_tile_p::WARPS_N * sizeof(float) * 2; + + __device__ inline Gemm_Q_K_base( + char* smem_ptr_q, + char* smem_ptr_k, + const int tidx) + : smem_q(smem_ptr_q, tidx), smem_k(smem_ptr_k, tidx) {} + + __device__ inline void load_q() { + smem_q.load(frag_q[0], 0); + } + + __device__ inline void reload_q() { + smem_q.load(frag_q[0], 0); + } + + Fragment_q frag_q[2][Mma_tile_p::MMAS_M]; + Smem_tile_q smem_q; + Smem_tile_k smem_k; +}; + +template +struct Gemm_Q_K : public Gemm_Q_K_base { + using Base = Gemm_Q_K_base; + using Smem_tile_o = typename Base::Smem_tile_o; + using Smem_tile_q = typename Base::Smem_tile_q; + using Smem_tile_k = typename Base::Smem_tile_k; + using Fragment_k = typename Base::Fragment_k; + using Mma_tile_p = typename Base::Mma_tile_p; + + static constexpr bool SHARE_SMEM_FOR_K_AND_V = + Kernel_traits::SHARE_SMEM_FOR_K_AND_V; + // If V is stored in shared memory, we can't load K using the same shared + // memory. 
+ static_assert(Kernel_traits::V_IN_REGS); + + static constexpr int SMEM_OFFSET_O = Smem_tile_q::BYTES_PER_TILE; + static constexpr int SMEM_OFFSET_SOFTMAX = + SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE; + static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE); + + // Q | K / V + // | O | SOFTMAX + static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE + + std::max((SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE, + Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX); + + __device__ inline Gemm_Q_K(char* smem_, const int tidx) + : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {} + + __device__ inline void load_k() { +#pragma unroll + for (int ki = 0; ki < Mma_tile_p::MMAS_K; ++ki) { + Base::smem_k.load(frag_k[ki], ki); + } + } + + template + __device__ inline void operator()(Acc (&acc_p)[M][N]) { +// Do this part of P^T = (Q * K^T)^T. +#pragma unroll + for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) { + // Trigger the load from shared memory for the next series of Q values. + Base::smem_q.load(Base::frag_q[ki & 1], ki); + // Do the math for the values already in registers. + fmha::gemm_cl(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]); + } + // Do the final stage of math. + { + int ki = Mma_tile_p::MMAS_K; + fmha::gemm_cl(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1)]); + } + } + + __device__ inline void reload_k() { + // Noop. + } + + Fragment_k frag_k[Mma_tile_p::MMAS_K][Mma_tile_p::MMAS_N]; +}; + +template +struct Gemm_Q_K : public Gemm_Q_K_base { + using Base = Gemm_Q_K_base; + using Smem_tile_o = typename Base::Smem_tile_o; + using Smem_tile_q = typename Base::Smem_tile_q; + using Smem_tile_k = typename Base::Smem_tile_k; + using Smem_tile_v = typename Kernel_traits::Smem_tile_v; + using Fragment_k = typename Base::Fragment_k; + using Mma_tile_p = typename Base::Mma_tile_p; + Fragment_k frag_k[2][Mma_tile_p::MMAS_N]; + + static constexpr bool SHARE_SMEM_FOR_K_AND_V = + Kernel_traits::SHARE_SMEM_FOR_K_AND_V; + static constexpr bool V_IN_REGS = Kernel_traits::V_IN_REGS; + static_assert(V_IN_REGS || !SHARE_SMEM_FOR_K_AND_V); + + static constexpr int SMEM_OFFSET_V = Smem_tile_q::BYTES_PER_TILE + + (SHARE_SMEM_FOR_K_AND_V ? 0 : Smem_tile_k::BYTES_PER_TILE); + static_assert( + Smem_tile_v::BYTES_PER_TILE == (int)Smem_tile_k::BYTES_PER_TILE); + static constexpr int SMEM_OFFSET_O = + SMEM_OFFSET_V + Smem_tile_v::BYTES_PER_TILE; + static constexpr int SMEM_OFFSET_SOFTMAX = + SMEM_OFFSET_O + Smem_tile_o::BYTES_PER_TILE; + + // If V_IN_REGS and SHARE_SMEM_FOR_K_AND_V: Q | K/V | O | SOFTMAX + // If !V_IN_REGS (then !SHARE_SMEM_FOR_K_AND_V): Q | K | V | O | SOFTMAX + static constexpr int SMEM_BYTES = Smem_tile_q::BYTES_PER_TILE + + (SHARE_SMEM_FOR_K_AND_V ? 1 : 2) * Smem_tile_k::BYTES_PER_TILE + + Smem_tile_o::BYTES_PER_TILE + Base::SMEM_BYTES_SOFTMAX; + + __device__ inline Gemm_Q_K(char* smem_, const int tidx) + : Base(smem_, smem_ + Smem_tile_q::BYTES_PER_TILE, tidx) {} + + __device__ inline void load_k() { + Base::smem_k.load(frag_k[0], 0); + } + + template + __device__ inline void operator()(Acc (&acc_p)[M][N]) { +// Do this part of P^T = (Q * K^T)^T. +#pragma unroll + for (int ki = 1; ki < Mma_tile_p::MMAS_K; ++ki) { + // Trigger the load from shared memory for the next series of Q values. + Base::smem_q.load(Base::frag_q[ki & 1], ki); + Base::smem_k.load(frag_k[ki & 1], ki); + // Do the math for the values already in registers. 
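+      // (frag_q / frag_k are double-buffered via the "ki & 1" index: the loads
+      // above fill one buffer for iteration ki while the MMA below consumes
+      // the buffer filled for iteration ki - 1.)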
+ fmha::gemm_cl(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]); + } + // Do the final stage of math. + { + int ki = Mma_tile_p::MMAS_K; + fmha::gemm_cl(acc_p, Base::frag_q[(ki - 1) & 1], frag_k[(ki - 1) & 1]); + } + } + + __device__ inline void reload_k() { + Base::smem_k.load(frag_k[0], 0); + } +}; + +template +constexpr size_t get_dynamic_smem_size() { + return Gemm_Q_K::SMEM_BYTES; +} + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax, + bool Is_first, + bool Is_last, + typename Params, + typename Prng> +inline __device__ void device_1xN_( + const Params& params, + const int bidb, + const int bidh, + int begin, + int steps, + Prng& ph0, + Prng& ph1, + const int loop_step_idx) { + // The description of the CTA tile for the 1st batched GEMM. + using Cta_tile_p = typename Kernel_traits::Cta_tile_p; + // The description of the CTA tile for the 2nd batched GEMM. + using Cta_tile_o = typename Kernel_traits::Cta_tile_o; + + // The MMA tile for the 1st GEMM. + using Mma_tile_p = fmha::Hmma_tile; + // The MMA tile for the 2nd GEMM. + using Mma_tile_o = fmha::Hmma_tile; + + // The global memory tile to load Q. + using Gmem_tile_q = typename Kernel_traits::Gmem_tile_q; + + // The global memory tile to load K. + using Gmem_tile_k = typename Kernel_traits::Gmem_tile_k; + + // The global memory tile to load V. + using Gmem_tile_v = typename Kernel_traits::Gmem_tile_v; + // The shared memory tile to swizzle V. + using Smem_tile_v = typename Kernel_traits::Smem_tile_v; + + // The global memory tile to store O. + using Gmem_tile_o = typename Kernel_traits::Gmem_tile_o; + using Gmem_tile_o_tmp = fmha::Gmem_tile_o; + // The shared memory tile to swizzle O. + using Smem_tile_o = typename Kernel_traits::Smem_tile_o; + + using Gmem_tile_s = typename Kernel_traits::Gmem_tile_s; + + using Gmem_softmax_sum = typename Kernel_traits::Gmem_softmax_sum; + + using Smem_softmax_sum = typename Kernel_traits::Smem_dp_sum; + + using Gemm1 = Gemm_Q_K; + + using Softmax = fmha::Softmax; + + // Shared memory. + extern __shared__ char smem_[]; + + // The thread index. + const int tidx = threadIdx.x; + + const BlockInfoPadded binfo(params, bidb, bidh, tidx); + // if( binfo.stop_early() ) return; + if (binfo.stop_early(loop_step_idx * Cta_tile_p::N)) + return; + + Gemm1 gemm_q_k(smem_, tidx); + // Allocate the global memory tile loader for Q. + Gmem_tile_q gmem_q( + params.q_ptr, + params.q_row_stride_in_elts, + params.q_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for O. + Gmem_tile_o gmem_o( + params.o_ptr, + params.o_row_stride_in_elts, + params.o_head_stride_in_elts, + binfo, + tidx); + Gmem_tile_o_tmp gmem_o_tmp( + params.o_tmp_ptr, + params.o_row_stride_in_elts, + params.o_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for S. + Gmem_tile_s gmem_s(params, binfo, tidx); + Gmem_softmax_sum gmem_softmax_lse(params.softmax_lse_ptr, params, tidx); + + // Wind gmem tiles to the correct position. + static_assert(Cta_tile_p::N % Cta_tile_p::M == 0); + const int begin_og = begin; + begin = Is_causal + ? 
std::max(begin, loop_step_idx * Cta_tile_p::N / Cta_tile_p::M) + : begin; + const int steps_og = steps; + steps -= begin - begin_og; + gmem_q.move(begin); + gmem_o.move(begin); + gmem_o_tmp.move(begin); + if (Return_softmax) { + gmem_s.move(begin); + } + gmem_softmax_lse.move(begin); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("begin = %d, steps = %d\n", begin, steps); + // } + + fmha::Mask mask(binfo, tidx, loop_step_idx); + + // Allocate the global memory tile loader for K. + Gmem_tile_k gmem_k( + params.k_ptr, + params.k_row_stride_in_elts, + params.k_head_stride_in_elts, + binfo, + tidx); + // Allocate the global memory tile loader for V. + Gmem_tile_v gmem_v( + params.v_ptr, + params.v_row_stride_in_elts, + params.v_head_stride_in_elts, + binfo, + tidx); + // The base pointer of smem_v; + char* smem_v_ = &smem_[Gemm1::SMEM_OFFSET_V]; + + // Allocate the shared memory tile loader for V. We use the same as K so be + // careful!!! + Smem_tile_v smem_v(smem_v_, tidx); + + // Allocate the shared memory tile loader for O. We use the same as K so be + // careful!!! + Smem_tile_o smem_o(&smem_[Gemm1::SMEM_OFFSET_O], tidx); + + if (!Is_first) { + gmem_k.move(loop_step_idx); + gmem_v.move(loop_step_idx); + if (Return_softmax) { + gmem_s.move(loop_step_idx * steps_og); + } + } + + // Trigger the loads for K. + gmem_k.load(); + // Trigger the loads for Q. + gmem_q.load(); + // Trigger the loads for V. + gmem_v.load(); + + if (!Is_first) { + __syncthreads(); + } + + float p_prev_lse[Mma_tile_p::MMAS_M * 2]; + if (!Is_first) { + gmem_softmax_lse.load( + reinterpret_cast(p_prev_lse)); + } + + // Commit the data for Q and V to shared memory. + gmem_q.commit(gemm_q_k.smem_q); + gmem_v.commit(smem_v); + + // const uint32_t scale_bmm1 = reinterpret_cast(params.scale_bmm1); #pragma unroll for(int it=0;it < + // Gmem_tile_k::LDGS;it++){ + // gmem_k.fetch_[it] = fmha::hmul8(scale_bmm1, gmem_k.fetch_[it]); + // } + + // Commit the data for K to shared memory. + if (!Kernel_traits::SHARE_SMEM_FOR_K_AND_V) { + gmem_k.commit(gemm_q_k.smem_k); + } + + __syncthreads(); + + // Load the fragments for Q. + gemm_q_k.load_q(); + + // Load the fragments for V. We keep the data in registers during the entire + // kernel. + typename Smem_tile_v::Fragment frag_v[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_N]; +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) { + smem_v.load(frag_v[ki], ki); + } + + // Commit the data for V to shared memory if it has not been done already. + if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V) { + // Make sure we are done loading the fragments for K. + __syncthreads(); + + // Commit the data to shared memory for V. + gmem_k.commit(gemm_q_k.smem_k); + + // Make sure the data is in shared memory. + __syncthreads(); + } + + // Load the fragments for K. + gemm_q_k.load_k(); + + // Create the object to do the softmax. + Softmax softmax(params, &smem_[Gemm1::SMEM_OFFSET_SOFTMAX], tidx); + + Smem_softmax_sum smem_softmax_lse( + reinterpret_cast(&smem_[Gemm1::SMEM_BYTES]), tidx); + + // Load over the entire sequence length. + for (int l = 0; l < steps; l++) { + if ((begin + l) * Cta_tile_p::M >= binfo.actual_seqlen) + break; + + // Declare the accumulators for the 1st gemm. + fmha::Fragment_accumulator acc_p[Mma_tile_p::MMAS_M][Mma_tile_p::MMAS_N]; + fmha::Clear_accumulator< + typename fmha::Accumulator_type, + Cta_tile_p::WARPS_K>::apply(acc_p); + + // Do this part of P = Q * K^T. 
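+    // One iteration of this loop handles an M-row slab of queries against the
+    // current K/V chunk: (1) P = Q * K^T, (2) mask + scaled softmax, carrying
+    // the running max / log-sum-exp across chunks, (3) optional dropout, and
+    // (4) O += P^T * V^T with rescaling of the partial output from previous
+    // chunks. Step (1) is the call just below.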
+ gemm_q_k(acc_p); + + uint4 out[Gmem_tile_o::STGS_PER_LOOP]; + if (!Is_first) { + gmem_o_tmp.load(out, 0); + } + + // Trigger the load for the next Q values. + if (l < steps - 1) { + gemm_q_k.smem_q.move_to_next_write_buffer(); + gmem_q.move(); + gmem_q.load(); + } + + // Load the mask for that iteration. + mask.load(begin + l); + + // Convert from the accumulator type to FP32 for Softmax. + softmax.unpack_noscale(acc_p); + + // Apply the mask. + softmax.apply_mask(mask); + + if (Kernel_traits::SHARE_SMEM_FOR_K_AND_V && l == 0) { + // if we share K and V, it could be that V was not fully read yet but we + // write into smem for reduction + __syncthreads(); + } + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_prev_lse=%.6f, %.6f\n", p_prev_lse[0], p_prev_lse[1]); + // } + // } + // Compute the max. + float p_max[Mma_tile_p::MMAS_M * 2]; + if (!Is_first) { + smem_softmax_lse.store_pair(p_prev_lse, l % 2); + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { p_max[mi] = + // p_prev_lse[mi]; } + for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { + p_max[mi] = p_prev_lse[mi] / params.scale_bmm1f; + } + } + + // Trigger the load for the next LSE values. + if (l < steps - 1) { + if (!Is_first) { + gmem_softmax_lse.load_next( + reinterpret_cast(p_prev_lse)); + } + } + + softmax.template reduce_max(p_max); + + // if ((threadIdx.x == 0) && (l == 38)) { + // printf("loop_step_idx %d, p_max = %.6f, %.6f., p_prev_lse = %.6f, + // %.6f\n", loop_step_idx, p_max[0], p_max[1], Is_first ? -10000.f : + // p_prev_lse[0], Is_first ? -10000.f : p_prev_lse[1]); + // } + + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("after reduce_max=%.6f, %.6f\n", softmax.elt_[0][0], + // softmax.elt_[0][1]); + // } + // } + + // Compute the exponential value. + // softmax.apply_exp(p_max); + softmax.scale_apply_exp(p_max, params.scale_bmm1f); + + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("after apply_exp=%.6f, %.6f\n", softmax.elt_[0][0], + // softmax.elt_[0][1]); + // } + // } + + // Compute the sum. + float p_sum[Mma_tile_p::MMAS_M * 2]; + // if (!Is_first) { + // int warp = tidx / Cta_tile_p::THREADS_PER_WARP; + // int lane = tidx % Cta_tile_p::THREADS_PER_WARP; + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; mi++) { + // p_sum[mi] = ((warp == 0) && (lane % 4 == 0)) ? + // expf(p_prev_lse[mi] - p_max[mi]) : 0; + // } + // } + // softmax.reduce_sum(p_sum); + softmax.reduce_sum_before_sync_(p_sum); + // softmax.template reduce_sum_before_sync_(p_sum); + + // float p_sum_log[Mma_tile_p::MMAS_M * 2]; + // for (int mi = 0; mi < Mma_tile_p::MMAS_M * 2; ++mi) { + // float sum = p_sum[mi]; + // // p_sum_log[mi] = (sum == 0.f || sum != sum) ? INFINITY : p_max[mi] + // + __logf(sum); constexpr float kLog2e = M_LOG2E; p_sum_log[mi] = (sum + // == 0.f || sum != sum) ? INFINITY : p_max[mi] * kLog2e + __log2f(sum); + // } + // // gmem_softmax_lse.store(reinterpret_cast(p_sum)); + // gmem_softmax_lse.store(reinterpret_cast(p_sum_log)); gmem_softmax_lse.move(); + + // // Finalize softmax on the accumulators of P^T. 
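+    // Note: the 1 / sum normalization is not applied here; it is deferred to
+    // the output stage below (out[jj] is multiplied by inv_sum), so acc_o
+    // accumulates unnormalized exp-weighted values across K/V chunks.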
+ // softmax.scale(p_sum); + + constexpr bool encode_dropout_in_sign_bit = Return_softmax; + if (Is_dropout) { + // softmax.template apply_dropout(ph0, + // params.p_dropout_in_uint); softmax.template + // apply_dropout(ph0, ph1, + // params.p_dropout_in_uint); + softmax.template apply_dropout_16bits( + ph0, ph1, params.p_dropout_in_uint16_t); + } + + using Frag_p = fmha::Fragment_a; + Frag_p frag_p[Mma_tile_o::MMAS_K][Mma_tile_o::MMAS_M]; + static_assert(Mma_tile_o::MMAS_M == Mma_tile_p::MMAS_M); + static_assert(Mma_tile_o::MMAS_K == Mma_tile_p::MMAS_N); + softmax.pack(frag_p); + if (Return_softmax) { + gmem_s.store(frag_p, mask); + gmem_s.move(); + } + + // Commit the values for Q into shared memory. + if (l < steps - 1) { + gmem_q.commit(gemm_q_k.smem_q); + } + + if (Is_dropout && encode_dropout_in_sign_bit) { +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ki++) { +#pragma unroll + for (int mi = 0; mi < Mma_tile_o::MMAS_M; mi++) { + frag_p[ki][mi].hrelu_(); + } + } + } + + // Declare the accumulators for the 2nd gemm. + fmha::Fragment_accumulator acc_o[Mma_tile_o::MMAS_M][Mma_tile_o::MMAS_N]; + fmha::Clear_accumulator< + typename fmha::Accumulator_type, + Cta_tile_o::WARPS_K>::apply(acc_o); + +// Do this part of O = P^T * V^T. +#pragma unroll + for (int ki = 0; ki < Mma_tile_o::MMAS_K; ++ki) { + fmha::gemm_cl(acc_o, frag_p[ki], frag_v[ki]); + // if ((threadIdx.x == 4) && (blockIdx.x == 0) && (blockIdx.y == 0) && (l + // == 0)) { + // float2 tmp_p = __half22float2(reinterpret_cast<__half2 + // &>(frag_p[ki])); float2 tmp_v = + // __half22float2(reinterpret_cast<__half2 &>(frag_v[ki])); + // printf("Per warp, threadIdx.x = %d, frag_p = %.6f, %.6f, frag_v = + // %.6f, %.6f, acc_o=%.6f\n", threadIdx.x, tmp_p.x, tmp_p.y, tmp_v.x, + // tmp_v.y, acc_o[0][0].elt(0)); + // } + } + + // if ((threadIdx.x % 32 == 16) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("Per warp, threadIdx.x = %d, acc_o=%.6f\n", threadIdx.x, + // acc_o[0][2].elt(0)); + // } + + // The mapping from tidx to rows changes between the softmax and the + // O-reduction. So we recalculate the max. + float p_max_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; + // TODO: not sure if this is right for seqlen 128 or 256 + int rows[Gmem_tile_o::STGS_PER_LOOP]; + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + rows[jj] = + tidx / Gmem_tile_o::THREADS_PER_ROW + jj * Gmem_tile_o::ROWS_PER_STG; + } + softmax.reduce_max_after_sync_(p_max_o, rows); + static_assert(Mma_tile_o::MMAS_M == 1); + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + p_max_o[jj][0] *= params.scale_bmm1f; + } + float p_prev_scale_o[Gmem_tile_o::STGS_PER_LOOP]; + if (!Is_first) { + smem_softmax_lse.load(p_prev_scale_o, rows, l % 2); + } + // if (!Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_prev_scale_o=%.6f\n", p_prev_scale_o[0]); + // } + // } + + static_assert(Gmem_tile_o::LOOPS == 1); + + // Swizzle the elements and do the final reduction. + smem_o.store(acc_o, 0); + + // Make sure the data is in shared memory. 
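+    // After the barrier, the row sums of this chunk are merged with the
+    // carried statistics: the contribution of the previous chunks enters as
+    // exp(lse_prev - max_new), and the row LSE is updated to
+    // max_new + log(sum), i.e. the usual streaming log-sum-exp recurrence.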
+ __syncthreads(); + + static_assert(Mma_tile_o::MMAS_M == 1); + float p_sum_o[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; + softmax.reduce_sum_after_sync_(p_sum_o, rows); + if (!Is_first) { + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + p_prev_scale_o[jj] = expf(p_prev_scale_o[jj] - p_max_o[jj][0]); + p_sum_o[jj][0] += p_prev_scale_o[jj]; + } + } + + float p_sum_log[Gmem_tile_o::STGS_PER_LOOP][Mma_tile_o::MMAS_M]; +#pragma unroll + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + float sum = p_sum_o[jj][0]; + p_sum_log[jj][0] = + (sum == 0.f || sum != sum) ? -INFINITY : p_max_o[jj][0] + __logf(sum); + // if (sum == 0.f || sum != sum) { + // printf("loop_step_idx = %d, l = %d, tidx = %d, sum = %.6f, p_max_o + // = %.6f\n", loop_step_idx, l, tidx, sum, p_max_o[jj][0]); + // } + // if (Is_first) { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && + // (l == 0)) { + // printf("p_sum_log=%.6f\n", p_sum_log[jj][0]); + // } + // } + if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) && + (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) { + gmem_softmax_lse.store_row( + reinterpret_cast(p_sum_log[jj]), + rows[jj]); + } + } + gmem_softmax_lse.move(); + + // Load from shared memory. + if (!Is_first) { + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + out[jj] = fmha::fmul4(out[jj], p_prev_scale_o[jj]); + } + } + smem_o.template load(out); + + const bool is_final_write = Is_last || + ((loop_step_idx + 1) * Cta_tile_p::N >= binfo.actual_seqlen) || + ((Is_causal) && + ((begin + l) * Cta_tile_p::M < (loop_step_idx + 1) * Cta_tile_p::N)); +#pragma unroll + for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + float sum = p_sum_o[jj][0]; + float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum; + if (Is_dropout && is_final_write) { + inv_sum *= params.rp_dropout; + } + out[jj] = fmha::fmul4(out[jj], inv_sum); + } + + // if (Is_dropout && Is_last) { + // for (int jj = 0; jj < Gmem_tile_o::STGS_PER_LOOP; jj++) { + // out[jj] = fmha::fmul4(out[jj], params.rp_dropout); + // } + // } + + // Output the values. + if (is_final_write) { + gmem_o.store(out, 0); + gmem_o.move(); + } else { + gmem_o_tmp.store(out, 0); + } + + // Move to the next part of the output. + if (!(Is_first && Is_last)) { + gmem_o_tmp.move(); + } + gemm_q_k.reload_k(); + + // Make sure we are reading from the correct buffer. + gemm_q_k.smem_q.move_to_next_read_buffer(); + // Trigger the load from shared memory for the next series of Q values. + if (l < steps - 1) { + gemm_q_k.reload_q(); + } + + } // Outer loop over the sequence length. +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Kernel_traits, + bool Is_dropout, + bool Is_causal, + bool Return_softmax, + typename Params> +inline __device__ void device_1xN_loop(const Params& params) { + // The block index for the batch. + const int bidb = blockIdx.y; + // The block index for the head. + const int bidh = blockIdx.x; + // The thread index. 
+ const int tidx = threadIdx.x; + + const int tidx_global = (bidb * params.h + bidh) * blockDim.x * 2 + tidx; + // auto seeds = at::cuda::philox::unpack(params.philox_args); + auto seeds = std::make_tuple(0, 0); + Philox ph0(std::get<0>(seeds), tidx_global, std::get<1>(seeds)); + Philox ph1(std::get<0>(seeds), tidx_global + blockDim.x, std::get<1>(seeds)); + const int STEPS = params.s / Kernel_traits::Cta_tile_p::M; + + constexpr int N_per_loop = Kernel_traits::Cta_tile_p::N; + if (params.s == N_per_loop) { + fmha::device_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + true, + true>(params, bidb, bidh, 0, STEPS, ph0, ph1, 0); + } else { + const int max_loop_steps = (params.s + N_per_loop - 1) / N_per_loop; + fmha::device_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + true, + false>(params, bidb, bidh, 0, STEPS, ph0, ph1, 0); + for (int loop_step_idx = 1; loop_step_idx < max_loop_steps - 1; + loop_step_idx++) { + fmha::device_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + false, + false>(params, bidb, bidh, 0, STEPS, ph0, ph1, loop_step_idx); + } + fmha::device_1xN_< + Kernel_traits, + Is_dropout, + Is_causal, + Return_softmax, + false, + true>(params, bidb, bidh, 0, STEPS, ph0, ph1, max_loop_steps - 1); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h new file mode 100644 index 000000000..43692802b --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_kernel.h @@ -0,0 +1,204 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace fmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BlockInfoPadded { + template + __device__ BlockInfoPadded( + const Params& params, + const int bidb, + const int bidh, + const int tidx) + : bidb(bidb), bidh(bidh), h(params.h) { + // The block index. + sum_s = params.cu_seqlens[bidb]; + actual_seqlen = params.cu_seqlens[bidb + 1] - sum_s; + bidx = sum_s * params.h + bidh; + + tidx_global = (bidb * params.h + bidh) * THREADS_PER_CTA + tidx; + } + + __device__ bool stop_early(const int start_col = 0) const { + return actual_seqlen <= start_col; + } + + int actual_seqlen; + int bidx; + int sum_s; + int bidh; + int bidb; + int tidx_global; + int h; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Noloop_traits { + // Interpretation of Cta_tile dims, i.e. Cta_tile_p: + enum { STEP = Cta_tile::M }; + enum { SEQLEN = Cta_tile::N }; + + template + inline __device__ Noloop_traits(const int bidc, const Block_info& binfo) + : bidc_(bidc) { + const int seqlen = binfo.actual_seqlen; + const int steps = (seqlen + STEP - 1) / STEP; + const int steps_per_chunk = (steps + CHUNKS - 1) / CHUNKS; + + const int step_begin = bidc_ * steps_per_chunk; + const int step_end = min(steps, (bidc_ + 1) * steps_per_chunk); + const int actual_steps = max(0, step_end - step_begin); + loop_offset_ = step_begin; + num_steps_ = actual_steps; + } + + template + inline __device__ void move_all(Tiles&... tiles) const { + using expand_type = int[]; + for (int s = 0; s < loop_offset_; s++) { + expand_type{(tiles.move(), 0)...}; + } + } + + inline __device__ int get_idx_dk() const { + // return bidc_; + return bidc_ * 2 + 0; + } + + inline __device__ int get_idx_dv() const { + // return CHUNKS + bidc_; + return bidc_ * 2 + 1; + } + + inline __device__ int offset_loop_count(const int l) { + // convert loop counter to position in the outer sequence + return (loop_offset_ + l) * STEP; + } + + const uint32_t bidc_; + int loop_offset_; + int num_steps_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +std::tuple work_dist( + const int total_ctas, + const int heads_total) { + constexpr int STEPS_PER_HEAD = + Kernel_traits::Cta_tile_p::N / Kernel_traits::Cta_tile_p::M; + + const int num_full_heads = heads_total / total_ctas; + const int heads_last_wave = heads_total % total_ctas; + + int num_main_groups = 0; + int main_steps = 0; + int rest_steps = 0; + if (heads_last_wave > 0) { + // Number of CTA groups that process within heads. + num_main_groups = total_ctas / heads_last_wave; + // Remaining CTAs that process between heads. 
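+    // Hypothetical example: with total_ctas = 8 and heads_total = 20,
+    // num_full_heads = 2, heads_last_wave = 4 and num_main_groups = 2, i.e.
+    // pairs of CTAs split the steps of each remaining head; the leftover
+    // ("rest") CTAs are counted next.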
+ const int rest_ctas = total_ctas - (heads_last_wave * num_main_groups); + if (rest_ctas == 0) { + // We have exactly "num_main_groups" CTAs to process each of the remaining + // heads. + main_steps = (STEPS_PER_HEAD + num_main_groups - 1) / num_main_groups; + num_main_groups = STEPS_PER_HEAD / main_steps; // Here: main_step > 0 + rest_steps = STEPS_PER_HEAD % main_steps; + + } else { + // Ideal number of steps if we could load-balance as evenly as possible. + const int steps_ideal = + (heads_last_wave * STEPS_PER_HEAD + total_ctas - 1) / total_ctas; + // Iterations that a "rest" CTA has to do at most. + const int max_rest_iters = (heads_last_wave + rest_ctas - 1) / rest_ctas; + // Find the first step distribution, s.t. the maximum work of the "rest" + // CTAs is less than the work of the main CTAs. + main_steps = steps_ideal; + rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups; + for (; main_steps * num_main_groups < STEPS_PER_HEAD; main_steps++) { + rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups; + const int max_rest_total_steps = rest_steps * max_rest_iters; + if (max_rest_total_steps < main_steps) + break; + } + rest_steps = STEPS_PER_HEAD - main_steps * num_main_groups; + } + } + + using Cta_tile_p = typename Kernel_traits::Cta_tile_p; + using Mma_tile_p = fmha::Hmma_tile; + + const int max_steps = + STEPS_PER_HEAD * num_full_heads + std::max(main_steps, rest_steps); + const int elts_per_thread_per_step = + Mma_tile_p::MMAS_M * Mma_tile_p::MMAS_N * 8; + const int elts_per_thread = max_steps * elts_per_thread_per_step; + + return { + num_full_heads, + num_main_groups, + heads_last_wave, + main_steps, + rest_steps, + elts_per_thread}; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace fmha diff --git a/python/aitemplate/backend/cuda/attention/src/fmha_utils.h b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h new file mode 100644 index 000000000..af8456621 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/fmha_utils.h @@ -0,0 +1,111 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/****************************************************************************** + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define FMHA_CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + auto msg = std::string("CUDA error(") + __FILE__ + ":" + \ + std::to_string(__LINE__) + cudaGetErrorString(status_); \ + std::cerr << msg << std::endl; \ + throw std::runtime_error(msg); \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +enum Data_type { + DATA_TYPE_FP16, + DATA_TYPE_FP32, + DATA_TYPE_INT32, + DATA_TYPE_INT8 +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline void set_alpha(uint32_t& alpha, float norm, Data_type dtype) { + if (dtype == DATA_TYPE_FP16) { + half x = __float2half_rn(norm); + uint16_t h = reinterpret_cast(x); + ushort2 h2 = {h, h}; + alpha = reinterpret_cast(h2); + } else if (dtype == DATA_TYPE_FP32) { + alpha = reinterpret_cast(norm); + } else if (dtype == DATA_TYPE_INT32) { + int32_t inorm = static_cast(norm); + alpha = reinterpret_cast(inorm); + } else { + assert(false); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline size_t get_size_in_bytes(size_t n, Data_type dtype) { + switch (dtype) { + case DATA_TYPE_FP32: + return n * 4; + case DATA_TYPE_FP16: + return n * 2; + case DATA_TYPE_INT32: + return n * 4; + case DATA_TYPE_INT8: + return n; + default: + assert(false); + return 0; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/python/aitemplate/backend/cuda/attention/src/licenses/LICENSE b/python/aitemplate/backend/cuda/attention/src/licenses/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/licenses/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/python/aitemplate/backend/cuda/attention/src/philox.cuh b/python/aitemplate/backend/cuda/attention/src/philox.cuh new file mode 100644 index 000000000..36e788400 --- /dev/null +++ b/python/aitemplate/backend/cuda/attention/src/philox.cuh @@ -0,0 +1,171 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Pytorch also has an implementation of Philox RNG: +// https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu +#pragma once +// Philox CUDA. + +namespace { + +class Philox { + public: + __device__ inline Philox( + unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) + : STATE(0), key(reinterpret_cast(seed)) { + // key.x = (unsigned int)seed; + // key.y = (unsigned int)(seed >> 32); + // counter = make_uint4(0, 0, 0, 0); + // counter.z = (unsigned int)(subsequence); + // counter.w = (unsigned int)(subsequence >> 32); + // STATE = 0; + // incr_n(offset / 4); + + // key = reinterpret_cast(seed); + ull2* tmp = reinterpret_cast(&counter); + tmp->x = offset / 4; + tmp->y = subsequence; + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("Philox counter: %d, %d, %d, %d\n", counter.x, counter.y, + // counter.z, counter.w); + // } + } + __device__ inline uint4 operator()() { + // if (STATE == 0) { + uint4 counter_ = counter; + uint2 key_ = key; +// 7-round philox +#pragma unroll + for (int i = 0; i < 6; i++) { + counter_ = single_round(counter_, key_); + key_.x += (kPhilox10A); + key_.y += (kPhilox10B); + } + // output = single_round(counter_, key_); + uint4 output = single_round(counter_, key_); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("Philox counter: %u, %u, %u, %u\n", counter.x, counter.y, + // counter.z, counter.w); printf("Philox output: %u, %u, %u, %u\n", + // output.x, output.y, output.z, output.w); + // } + incr(); + // } + // return a float4 directly + // unsigned long ret; + // switch(STATE) { + // case 0: ret = output.x; break; + // case 1: ret = output.y; break; + // case 2: ret = output.z; break; + // case 3: ret = output.w; break; + //} + // STATE = (STATE + 1) % 4; + return output; + } + + private: + struct ull2 { + uint64_t x; + uint64_t y; + }; + uint4 counter; + // uint4 output; + const uint2 key; + unsigned int STATE; + __device__ inline void incr_n(unsigned long long n) { + unsigned int nlo = (unsigned int)(n); + unsigned int nhi = (unsigned int)(n >> 32); + counter.x += nlo; + if (counter.x < nlo) + nhi++; + counter.y += nhi; + if (nhi <= counter.y) + return; + if (++counter.z) + return; + ++counter.w; + } + + __device__ uint4 incr128(uint4 ctr) { + uint4 res; + asm("add.cc.u32 %0, %4, %8;\n\t" + "addc.cc.u32 %1, %5, %9;\n\t" + "addc.cc.u32 %2, %6, %10;\n\t" + "addc.u32 %3, %7, %11;\n\t" + : "=r"(res.x), "=r"(res.y), "=r"(res.z), "=r"(res.w) + : "r"(ctr.x), + "r"(ctr.y), + "r"(ctr.z), + "r"(ctr.w), + "n"(1), + "n"(0), + "n"(0), + "n"(0)); 
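+    // The add.cc.u32 / addc.u32 chain above adds 1 to the 128-bit counter,
+    // propagating the carry through the four 32-bit words (x is the lowest).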
+ return res; + } + + __device__ inline void incr() { + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("Counter before: %u, %u, %u, %u\n", counter.x, counter.y, + // counter.z, counter.w); + // } + counter = incr128(counter); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("Counter after: %u, %u, %u, %u\n", counter.x, counter.y, + // counter.z, counter.w); + // } + } + __device__ unsigned int mulhilo32( + unsigned int a, + unsigned int b, + unsigned int* result_high) { + *result_high = __umulhi(a, b); + return a * b; + } + __device__ uint2 mulhilo32_v2(const unsigned int a, const unsigned int b) { + uint2* res; + unsigned long long tmp; + asm("mul.wide.u32 %0, %1, %2;\n\t" : "=l"(tmp) : "r"(a), "r"(b)); + res = (uint2*)(&tmp); + return *res; + } + __device__ inline uint4 single_round(const uint4 ctr, const uint2 key) { + // unsigned int hi0; + // unsigned int hi1; + // unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0); + // unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1); + // uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0}; + uint2 res0 = mulhilo32_v2(kPhiloxSA, ctr.x); + uint2 res1 = mulhilo32_v2(kPhiloxSB, ctr.z); + uint4 ret = { + res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x}; + return ret; + } + static const unsigned long kPhilox10A = 0x9E3779B9; + static const unsigned long kPhilox10B = 0xBB67AE85; + static const unsigned long kPhiloxSA = 0xD2511F53; + static const unsigned long kPhiloxSB = 0xCD9E8D57; +}; +// Inverse of 2^32. +constexpr float M_RAN_INVM32 = 2.3283064e-10f; +__device__ __inline__ float4 uniform4(const uint4 x) { + return make_float4( + x.x * M_RAN_INVM32, + x.y * M_RAN_INVM32, + x.z * M_RAN_INVM32, + x.w * M_RAN_INVM32); +} + +} // namespace diff --git a/python/aitemplate/backend/cuda/common/__init__.py b/python/aitemplate/backend/cuda/common/__init__.py new file mode 100644 index 000000000..2115b6952 --- /dev/null +++ b/python/aitemplate/backend/cuda/common/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +""" +CUDA Common module init +""" +from .dummy_op import * diff --git a/python/aitemplate/backend/cuda/common/dummy_op.py b/python/aitemplate/backend/cuda/common/dummy_op.py new file mode 100644 index 000000000..da293ee4e --- /dev/null +++ b/python/aitemplate/backend/cuda/common/dummy_op.py @@ -0,0 +1,36 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Dummy op codegen for CUDA. +""" + +from typing import Any, Dict + +from ... import registry + + +@registry.reg("cuda.size.gen_function") +def dummy_gen_function(func_attrs: Dict[str, Any]) -> str: + return "" + + +@registry.reg("cuda.size.func_decl") +def dummy_gen_function_decl(func_attrs): + return "" + + +@registry.reg("cuda.size.func_call") +def dummy_gen_function_call(func_attrs, indent): + return "" diff --git a/python/aitemplate/backend/cuda/conv2d/__init__.py b/python/aitemplate/backend/cuda/conv2d/__init__.py new file mode 100644 index 000000000..7d83ce1fd --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +""" +cuda conv2d module init +""" +from . import ( + conv2d, + conv2d_bias, + conv2d_bias_add, + conv2d_bias_add_hardswish, + conv2d_bias_add_relu, + conv2d_bias_few_channels, + conv2d_bias_hardswish, + conv2d_bias_hardswish_few_channels, + conv2d_bias_relu, + conv2d_bias_relu_few_channels, + conv2d_bias_sigmoid, + transposed_conv2d, + transposed_conv2d_bias, +) diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py new file mode 100644 index 000000000..9e0de0d91 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/common.py @@ -0,0 +1,244 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +common template for conv2d +""" +import re +from collections import OrderedDict +from hashlib import sha1 +from typing import List + +import jinja2 + +from ...target import Target +from ..gemm_universal.common import add_profiler, build_profiler # noqa: F401 + + +KERNEL_KEY_TEMPLATE = jinja2.Template( + """ +cutlass{{opcode_class}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}} +""" +) + + +def kernel_name(op): + """generate cuda kernel name""" + from cutlass_lib import library + + threadblock = op.tile_description.procedural_name() + extended_name = op.extended_name() + opcode_class_name = library.OpcodeClassNames[ + op.tile_description.math_instruction.opcode_class + ] + layout = op.layout_name() + align_ab = op.A.alignment + align_c = op.C.alignment + name = KERNEL_KEY_TEMPLATE.render( + threadblock=threadblock, + extended_name=extended_name, + opcode_class_name=opcode_class_name, + layout=layout, + align_ab=align_ab, + align_c=align_c, + ) + return name.replace("\n", "") + + +def emit_instance(op): + """emit instance""" + import cutlass_lib + + if hasattr(op, "binary_op"): + emiter = cutlass_lib.conv2d_operation.EmitConv2dWithBroadcastInstance() + else: + emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance() + op_def = emiter.emit(op) + return op_def + + +def extract_config(func_attrs, f_proc_op=None): + """Extracts cutlass config for conv kernels.""" + import copy + + import cutlass_lib + + def f_proc_op_default(op): + # import cutlass_lib + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + op_kind = cutlass_lib.library.OperationKind.Conv2d + conv_kind = cutlass_lib.library.ConvKind.Fprop + ret = [] + conv2d_ops = OrderedDict() + extract_ops = list(Target.current()._operators[op_kind].items()) + + for _, value in extract_ops: + op = value[0] + if op.conv_kind == conv_kind: + if f_proc_op is None: + ret = f_proc_op_default(op) + else: + ret = f_proc_op(op) + if len(ret) > 0: + for op_inst in ret: + key = kernel_name(op_inst) + conv2d_ops[key] = op_inst + return conv2d_ops + + +def extract_config_name(config): + """Extracts config name from a given config.""" + pattern = re.compile(r"\s*using\s(.*?)\s=") + decl = config.split("\n")[2] + match = pattern.match(decl) + if match is None: + raise RuntimeError("Invalid config: \n" + config) + return match.groups()[0] + + +def gen_function( + func_attrs, + instance_template, + exec_template, + src_template, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + f_emit_instance=emit_instance, + extra_header="", +): + """Function definition codegen.""" + func_name = func_attrs["name"] + exec_path = func_attrs["exec_path"] + op_instance = func_attrs["op_instance"] + + inst_def_flag = set() + instances = {} + instance_decl = "" + for 
key, value in exec_path.items(): + fname = "f" + sha1(key.encode()).hexdigest() + if value not in inst_def_flag: + config = f_emit_instance(op_instance[value]) + inst_def_flag.add(value) + else: + config = "" + inst = instance_template.render( + config=config, name=fname, config_name=extract_config_name(config) + ) + instances[key] = inst + instance_decl += inst + shape_eval_func = shape_eval_template.render( + indent=" ", + dtype="int64_t ", + x_dim0="*batch", + x_dim1="*in_h", + x_dim2="*in_w", + x_dim3="*in_ch", + w_dim0="*out_ch", + w_dim1="*kernel_h", + w_dim2="*kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + div="/", + ) + shape_save_func = shape_save_template.render( + indent=" ", + y_dim0="*out_batch", + y_dim1="*out_h", + y_dim2="*out_w", + y_dim3="*out_ch", + ) + shape_func = shape_eval_func + shape_save_func + exec_paths = "" + for key in instances: + fname = "f" + sha1(key.encode()).hexdigest() + program = exec_template.render(indent=" ", instance=fname) + exec_inst = exec_cond_remplate.render(indent=" ", cond=key, program=program) + exec_paths += exec_inst + return src_template.render( + instances=instance_decl, + function_name=func_name, + dtype="cutlass::half_t", + shape_function=shape_func, + exec_paths=exec_paths, + extra_header=extra_header, + ) + + +def cal_align_ab(x_shape: List[int]) -> int: + """Returns input alignment.""" + k = x_shape[3] # CI + if k % 8 == 0: + return 8 + if k % 4 == 0: + return 4 + if k % 2 == 0: + return 2 + raise RuntimeError("a/b is not aligned") + + +def function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + ab_alignment = cal_align_ab(x_shape) + tmp = cfg.split("_") + align_c = int(tmp[-1]) + align_ab = int(tmp[-2]) + if align_c != func_attrs["epilogue_alignment"]: + return False + if align_ab != ab_alignment: + return False + return True diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py new file mode 100644 index 000000000..ddcef02b3 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_activation.py @@ -0,0 +1,373 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +common templates for conv_bias_activation subgraph +""" +import jinja2 + +from . 
import common + +# pylint: disable=C0103,C0301 + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>; +""" +) + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute; +// TODO: cast to right dtype +{{indent}}typename {{instance}}::Arguments arguments{ +{{indent}} problem_size, +{{indent}} {(cutlass::half_t*)(in_ptr), layout_A}, +{{indent}} {(cutlass::half_t*)(weight_ptr), layout_B}, +{{indent}} {(cutlass::half_t*)(bias_ptr), cutlass::layout::TensorNHWC::Stride(0)}, +{{indent}} {(cutlass::half_t*)(out_ptr), layout_C}, +{{indent}} {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, +{{indent}}}; +{{indent}}{{instance}} implicit_gemm_op; +{% if is_profiler %} +{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} +{{indent}}auto status = implicit_gemm_op.can_implement(arguments); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op.initialize(arguments, workspace); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}return; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include +#include + +{{extra_header}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +{{instances_def}} + +void {{function_name}} ( + cutlass::half_t* in_ptr, + cutlass::half_t* weight_ptr, + cutlass::half_t* out_ptr, + cutlass::half_t* bias_ptr, + uint8_t* workspace, + int64_t* batch, + int64_t* out_ch, + int64_t* in_ch, + int64_t* kernel_h, + int64_t* kernel_w, + int64_t* in_h, + int64_t* in_w, + int64_t* out_batch, + int64_t* out_h, + int64_t* out_w, + int stride, + int dilation, + int pad, + cudaStream_t stream + ) { + + {{shape_function}} + int i32_batch = *batch; + int i32_in_h = *in_h; + int i32_in_w = *in_w; + int i32_in_ch = *in_ch; + int i32_out_ch = *out_ch; + int i32_kernel_h = *kernel_h; + int i32_kernel_w = *kernel_w; + int i32_out_batch = *out_batch; + int i32_out_h = *out_h; + int i32_out_w = *out_w; + + using cutlass::layout::TensorNHWC; + TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch))); + TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch))); + TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch))); + + cutlass::conv::Conv2dProblemSize problem_size( + {i32_batch, i32_in_h, i32_in_w, i32_in_ch}, + {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch}, + {pad, pad, pad, pad}, + {stride, stride}, + {dilation, dilation}, + {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch}, + 
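+    // Note: kCrossCorrelation selects the cross-correlation convention used by
+    // common DL frameworks (no filter flipping); the trailing argument below is
+    // the number of split-K slices, fixed to 1 here.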
cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this conv2d specialization." + ); +} +""" +) + + +PROFILER_TEMPLATE = jinja2.Template( + """ +size_t GLOBAL_WORKSPACE_SIZE = 0; +{{op_func}} + +int main(int argc, char** argv) { + int64_t batch = std::stoi(argv[1]); + int64_t in_h = std::stoi(argv[2]); + int64_t in_w = std::stoi(argv[3]); + int64_t in_ch = std::stoi(argv[4]); + int64_t kernel_h = std::stoi(argv[5]); + int64_t kernel_w = std::stoi(argv[6]); + int64_t out_ch = std::stoi(argv[7]); + int stride = std::stoi(argv[8]); + int pad = std::stoi(argv[9]); + int dilation = std::stoi(argv[10]); + {{shape_func}} + using ElementOutput = typename {{name}}::ElementC; + using ElementInputA = typename {{name}}::ElementA; + using ElementInputB = typename {{name}}::ElementB; + + uint8_t* global_workspace = nullptr; + cudaStream_t stream = nullptr; + + cutlass::HostTensor x({NI, HI, WI, CI}); + cutlass::HostTensor w({CO, KH, KW, CI}); + cutlass::HostTensor b({(int)CO, 1, 1, 1}); + cutlass::HostTensor y({NO, HO, WO, CO}); + // + // warmup + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + (cutlass::half_t*) b.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + cudaEvent_t events[2]; + for (auto & event : events) { + cudaEventCreate(&event); + } + cudaEventRecord(events[0]); + for (int i = 0; i < 5; ++i) { + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + (cutlass::half_t*) b.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + } + cudaEventRecord(events[1]); + cudaEventSynchronize(events[1]); + float runtime_ms = 0; + cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + for (auto event : events) { + (void)cudaEventDestroy(event); + } + // TODO: output workspace + if (runtime_ms < 0.00001) { + throw std::runtime_error( + "OOB in cutlass." 
+ ); + } + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + +""" +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + uint8_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int, + int, + int, + cudaStream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{in_ptr}}, +{{indent}} {{weight_ptr}}, +{{indent}} {{out_ptr}}, +{{indent}} {{bias_ptr}}, +{{indent}} global_workspace, +{{indent}} {{p_batch}}, +{{indent}} {{p_out_ch}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p_kernel_h}}, +{{indent}} {{p_kernel_w}}, +{{indent}} {{p_in_h}}, +{{indent}} {{p_in_w}}, +{{indent}} {{p_out_batch}}, +{{indent}} {{p_out_h}}, +{{indent}} {{p_out_w}}, +{{indent}} {{stride}}, +{{indent}} {{dilation}}, +{{indent}} {{pad}}, +{{indent}} stream +{{indent}}); +""" +) + + +def gen_profiler(func_attrs, workdir, shape_template, extra_header=""): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + # shape func + shape_func = shape_template.render( + indent=" ", + dtype="int64_t ", + div="/", + x_dim0="batch", + x_dim1="in_h", + x_dim2="in_w", + x_dim3="in_ch", + w_dim0="out_ch", + w_dim1="kernel_h", + w_dim2="kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + ) + file_pairs = [] + for op_name, op in op_instance.items(): + config = common.emit_instance(op) + + config_name = common.extract_config_name(config) + name = "DeviceConvFwdInstance" + instance = INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, instance=name + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="conv", + dtype="cutlass::half_t", + shape_func="", + exec_paths=exec_program, + extra_header=extra_header, + ) + code = PROFILER_TEMPLATE.render( + op_func=op_func, shape_func=shape_func, name=name + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + b = func_attrs["inputs"][2] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + bias_ptr=b._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py new file mode 100644 index 000000000..0647769a1 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_bias_add_activation.py @@ -0,0 +1,348 @@ +# Copyright (c) Meta 
Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +common template for conv2d bias act residual add +""" +import jinja2 + +from . import common + +# pylint: disable=C0301,C0103 + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>; +""" +) + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute; +// TODO: cast to right dtype +{{indent}}typename {{instance}}::Arguments arguments{ +{{indent}} problem_size, +{{indent}} {(cutlass::half_t*)(in_ptr), layout_A}, +{{indent}} {(cutlass::half_t*)(weight_ptr), layout_B}, +{{indent}} {(cutlass::half_t*)(res_ptr), layout_C}, +{{indent}} {(cutlass::half_t*)(out_ptr), layout_C}, +{{indent}} {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, +{{indent}} cutlass::conv::SplitKMode::kSerial, +{{indent}} (cutlass::half_t*)(bias_ptr), +{{indent}} nullptr, 0, *out_ch +{{indent}}}; +{{indent}}{{instance}} implicit_gemm_op; +{% if is_profiler %} +{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} +{{indent}}auto status = implicit_gemm_op.can_implement(arguments); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op.initialize(arguments, workspace); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +return; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include +#include + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +{{instances_def}} + +void {{function_name}} ( + cutlass::half_t* in_ptr, + cutlass::half_t* weight_ptr, + cutlass::half_t* out_ptr, + cutlass::half_t* bias_ptr, + cutlass::half_t* res_ptr, + uint8_t* workspace, + int64_t* batch, + int64_t* out_ch, + int64_t* in_ch, + int64_t* kernel_h, + int64_t* kernel_w, + int64_t* in_h, + int64_t* in_w, + int64_t* out_batch, + int64_t* out_h, + int64_t* out_w, + int stride, + int dilation, + int pad, + cudaStream_t stream + ) { + + {{shape_function}} + int i32_batch = *batch; + int i32_in_h = *in_h; + int i32_in_w = *in_w; + int i32_in_ch = *in_ch; + int i32_out_ch = *out_ch; + int i32_kernel_h = *kernel_h; + int i32_kernel_w = *kernel_w; + 
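+  // The generated kernel receives dynamic shapes as int64_t*, but CUTLASS
+  // tensor coordinates and Conv2dProblemSize take 32-bit ints, so each extent
+  // is narrowed here (this assumes every dimension fits in int32).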
int i32_out_batch = *out_batch; + int i32_out_h = *out_h; + int i32_out_w = *out_w; + + using cutlass::layout::TensorNHWC; + TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch))); + TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch))); + TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch))); + + cutlass::conv::Conv2dProblemSize problem_size( + {i32_batch, i32_in_h, i32_in_w, i32_in_ch}, + {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch}, + {pad, pad, pad, pad}, + {stride, stride}, + {dilation, dilation}, + {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch}, + cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this conv2d specialization." + ); +} +""" +) + + +PROFILER_TEMPLATE = jinja2.Template( + """ +size_t GLOBAL_WORKSPACE_SIZE = 0; +{{op_func}} + +int main(int argc, char** argv) { + int64_t batch = std::stoi(argv[1]); + int64_t in_h = std::stoi(argv[2]); + int64_t in_w = std::stoi(argv[3]); + int64_t in_ch = std::stoi(argv[4]); + int64_t kernel_h = std::stoi(argv[5]); + int64_t kernel_w = std::stoi(argv[6]); + int64_t out_ch = std::stoi(argv[7]); + int stride = std::stoi(argv[8]); + int pad = std::stoi(argv[9]); + int dilation = std::stoi(argv[10]); + {{shape_func}} + using ElementOutput = typename {{name}}::ElementC; + using ElementInputA = typename {{name}}::ElementA; + using ElementInputB = typename {{name}}::ElementB; + + uint8_t* global_workspace = nullptr; + cudaStream_t stream = nullptr; + + cutlass::HostTensor x({NI, HI, WI, CI}); + cutlass::HostTensor w({CO, KH, KW, CI}); + cutlass::HostTensor b({(int)CO, 1, 1, 1}); + cutlass::HostTensor r({NO, HO, WO, CO}); + cutlass::HostTensor y({NO, HO, WO, CO}); + // + // warmup + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + (cutlass::half_t*) b.device_data(), + (cutlass::half_t*) r.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + cudaEvent_t events[2]; + for (auto & event : events) { + cudaEventCreate(&event); + } + cudaEventRecord(events[0]); + for (int i = 0; i < 5; ++i) { + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + (cutlass::half_t*) b.device_data(), + (cutlass::half_t*) r.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + } + cudaEventRecord(events[1]); + cudaEventSynchronize(events[1]); + float runtime_ms = 0; + cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + for (auto event : events) { + (void)cudaEventDestroy(event); + } + // TODO: output workspace + if (runtime_ms < 0.00001) { + throw std::runtime_error( + "OOB in cutlass." 
+ ); + } + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + uint8_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int, + int, + int, + cudaStream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{in_ptr}}, +{{indent}} {{weight_ptr}}, +{{indent}} {{out_ptr}}, +{{indent}} {{bias_ptr}}, +{{indent}} {{res_ptr}}, +{{indent}} global_workspace, +{{indent}} {{p_batch}}, +{{indent}} {{p_out_ch}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p_kernel_h}}, +{{indent}} {{p_kernel_w}}, +{{indent}} {{p_in_h}}, +{{indent}} {{p_in_w}}, +{{indent}} {{p_out_batch}}, +{{indent}} {{p_out_h}}, +{{indent}} {{p_out_w}}, +{{indent}} {{stride}}, +{{indent}} {{dilation}}, +{{indent}} {{pad}}, +{{indent}} stream +{{indent}}); +""" +) + + +def gen_profiler(func_attrs, workdir, shape_template): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + # shape func + shape_func = shape_template.render( + indent=" ", + dtype="int64_t ", + div="/", + x_dim0="batch", + x_dim1="in_h", + x_dim2="in_w", + x_dim3="in_ch", + w_dim0="out_ch", + w_dim1="kernel_h", + w_dim2="kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + ) + file_pairs = [] + for op_name, op in op_instance.items(): + config = common.emit_instance(op) + config_name = common.extract_config_name(config) + name = "DeviceConvFwdInstance" + instance = INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, instance=name + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="conv", + dtype="cutlass::half_t", + shape_func="", + exec_paths=exec_program, + ) + code = PROFILER_TEMPLATE.render( + op_func=op_func, shape_func=shape_func, name=name + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) diff --git a/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py new file mode 100644 index 000000000..c24f0a4db --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/common_conv2d_few_channels.py @@ -0,0 +1,111 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +common functions for conv2d op with few channels(< 8) +""" + +from collections import OrderedDict + +from ...target import Target +from . 
import common + + +def apply_special_config(func_attrs, op): + import cutlass_lib + + x = func_attrs["inputs"][0] + in_ch = x._attrs["shape"][-1]._attrs["values"][0] + + if in_ch == 3: + # By default we don't use it since the perf is worse than pad4+fixchannel + op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FewChannels + op.A.alignment = 1 + op.B.alignment = 1 + op.tile_description.stages = 2 + elif in_ch in [2, 4, 8]: + op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FixedChannels + op.A.alignment = in_ch + op.B.alignment = in_ch + op.tile_description.stages = 3 + return op + + +def extract_config(func_attrs): + """extract epilogue for conv op + + Parameters + ---------- + func_attrs : Dict + [description] op attributes + + Returns + ------- + [type]: Dict + [description] + + Raises + ------ + NotImplementedError + [description] + """ + import copy + + import cutlass_lib + + def f_proc_op_special(op): + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + op = apply_special_config(func_attrs, op) + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + op_kind = cutlass_lib.library.OperationKind.Conv2d + conv_kind = cutlass_lib.library.ConvKind.Fprop + ret = [] + conv2d_ops = OrderedDict() + extract_ops = list(Target.current()._operators[op_kind].items()) + + for _, value in extract_ops: + op = value[0] + if op.conv_kind == conv_kind: + ret = f_proc_op_special(op) + if len(ret) > 0: + for op_inst in ret: + key = common.kernel_name(op_inst) + conv2d_ops[key] = op_inst + return conv2d_ops diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d.py b/python/aitemplate/backend/cuda/conv2d/conv2d.py new file mode 100644 index 000000000..7e5da403f --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d.py @@ -0,0 +1,420 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen for conv2d. +""" +import jinja2 + +from ... import registry +from . 
import common + +# pylint: disable=C0103,C0415,W0613,C0301 + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = cutlass::conv::device::ImplicitGemmConvolution<{{config_name}}>; +""" +) + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementCompute; +// TODO: cast to right dtype +{{indent}}typename {{instance}}::Arguments arguments{ +{{indent}} problem_size, +{{indent}} {(cutlass::half_t*)(in_ptr), layout_A}, +{{indent}} {(cutlass::half_t*)(weight_ptr), layout_B}, +{{indent}} {(cutlass::half_t*)(out_ptr), layout_C}, +{{indent}} {(cutlass::half_t*)(out_ptr), layout_C}, +{{indent}} {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, +{{indent}}}; +{{indent}}{{instance}} implicit_gemm_op; +{% if is_profiler %} +{{indent}}size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} +{{indent}}auto status = implicit_gemm_op.can_implement(arguments); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op.initialize(arguments, workspace); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = implicit_gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" + +{{extra_header}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +{{instances_def}} + +void {{function_name}} ( + cutlass::half_t* in_ptr, + cutlass::half_t* weight_ptr, + cutlass::half_t* out_ptr, + uint8_t* workspace, + int64_t* batch, + int64_t* out_ch, + int64_t* in_ch, + int64_t* kernel_h, + int64_t* kernel_w, + int64_t* in_h, + int64_t* in_w, + int64_t* out_batch, + int64_t* out_h, + int64_t* out_w, + int stride, + int dilation, + int pad, + cudaStream_t stream + ) { + + {{shape_function}} + int i32_batch = *batch; + int i32_in_h = *in_h; + int i32_in_w = *in_w; + int i32_in_ch = *in_ch; + int i32_out_ch = *out_ch; + int i32_kernel_h = *kernel_h; + int i32_kernel_w = *kernel_w; + int i32_out_batch = *out_batch; + int i32_out_h = *out_h; + int i32_out_w = *out_w; + + using cutlass::layout::TensorNHWC; + TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch))); + TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch))); + TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch))); + + cutlass::conv::Conv2dProblemSize problem_size( + {i32_batch, i32_in_h, i32_in_w, i32_in_ch}, + {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch}, + {pad, pad, pad, pad}, + {stride, stride}, + {dilation, dilation}, + {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch}, + cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + {{exec_paths}} + throw std::runtime_error( + 
"Unsupported workload for this conv2d specialization." + ); +} +""" +) + + +PROFILER_TEMPLATE = jinja2.Template( + """ +size_t GLOBAL_WORKSPACE_SIZE = 0; + +{{op_func}} + +int main(int argc, char** argv) { + int64_t batch = std::stoi(argv[1]); + int64_t in_h = std::stoi(argv[2]); + int64_t in_w = std::stoi(argv[3]); + int64_t in_ch = std::stoi(argv[4]); + int64_t kernel_h = std::stoi(argv[5]); + int64_t kernel_w = std::stoi(argv[6]); + int64_t out_ch = std::stoi(argv[7]); + int stride = std::stoi(argv[8]); + int pad = std::stoi(argv[9]); + int dilation = std::stoi(argv[10]); + {{shape_func}} + using ElementOutput = typename {{name}}::ElementC; + using ElementInputA = typename {{name}}::ElementA; + using ElementInputB = typename {{name}}::ElementB; + + uint8_t* global_workspace = nullptr; + cudaStream_t stream = nullptr; + + cutlass::HostTensor x({NI, HI, WI, CI}); + cutlass::HostTensor w({CO, KH, KW, CI}); + cutlass::HostTensor y({NO, HO, WO, CO}); + + // + // warmup + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + cudaEvent_t events[2]; + for (auto & event : events) { + cudaEventCreate(&event); + } + cudaEventRecord(events[0]); + for (int i = 0; i < 5; ++i) { + conv((cutlass::half_t*) x.device_data(), + (cutlass::half_t*) w.device_data(), + (cutlass::half_t*) y.device_data(), + global_workspace, + &NI, + &CO, + &CI, + &KH, + &KW, + &HI, + &WI, + &NO, + &HO, + &WO, + stride, + dilation, + pad, + stream); + } + cudaEventRecord(events[1]); + cudaEventSynchronize(events[1]); + float runtime_ms = 0; + cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + for (auto event : events) { + (void)cudaEventDestroy(event); + } + // TODO: output workspace + if (runtime_ms < 0.00001) { + throw std::runtime_error( + "OOB in cutlass." 
+ ); + } + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} + +""" +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + uint8_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int64_t*, + int, + int, + int, + cudaStream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{in_ptr}}, +{{indent}} {{weight_ptr}}, +{{indent}} {{out_ptr}}, +{{indent}} global_workspace, +{{indent}} {{p_batch}}, +{{indent}} {{p_out_ch}}, +{{indent}} {{p_in_ch}}, +{{indent}} {{p_kernel_h}}, +{{indent}} {{p_kernel_w}}, +{{indent}} {{p_in_h}}, +{{indent}} {{p_in_w}}, +{{indent}} {{p_out_batch}}, +{{indent}} {{p_out_h}}, +{{indent}} {{p_out_w}}, +{{indent}} {{stride}}, +{{indent}} {{dilation}}, +{{indent}} {{pad}}, +{{indent}} stream +{{indent}}); +""" +) + + +@registry.reg("cuda.conv2d.config") +def conv2d_config(func_attrs, dtype="float16"): + """Populates conv2d cutlass configs into 'op_instance' field.""" + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.conv2d.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + """Codegen for conv2d profiler.""" + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + # shape func + shape_func = shape_template.render( + indent=" ", + dtype="int64_t ", + div="/", + x_dim0="batch", + x_dim1="in_h", + x_dim2="in_w", + x_dim3="in_ch", + w_dim0="out_ch", + w_dim1="kernel_h", + w_dim2="kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + ) + file_pairs = [] + for op_name, op in op_instance.items(): + config = common.emit_instance(op) + config_name = common.extract_config_name(config) + name = "DeviceConvFwdInstance" + instance = INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, instance=name + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="conv", + dtype="cutlass::half_t", + shape_func="", + exec_paths=exec_program, + ) + code = PROFILER_TEMPLATE.render( + op_func=op_func, shape_func=shape_func, name=name + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +@registry.reg("cuda.conv2d.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + """Codegen for conv2d function.""" + return common.gen_function( + func_attrs, + INSTANCE_TEMPLATE, + EXEC_TEMPLATE, + SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d.func_decl") +def conv2d_gen_function_decl(func_attrs): + """Codegen for conv2d function declaration.""" + func_name = func_attrs["name"] + return FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + """Codegen for conv2d function call.""" + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + 
wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.conv2d.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py new file mode 100644 index 000000000..c1ce2ac94 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias codegen +""" +from ... import registry +from . import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias.config") +def conv2d_config(func_attrs, dtype="float16"): + """Populates all available conv2d configs into the op_instance field.""" + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + """Codegen for conv2d profiler.""" + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + """Codegen for conv2d function.""" + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias.func_decl") +def conv2d_gen_function_decl(func_attrs): + """Codegen for conv2d function declaration.""" + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + """Codegen for conv2d function call.""" + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py new file mode 100644 index 000000000..663495f22 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias add codegen +""" +from ... import registry +from ...target import Target +from . import common, common_conv2d_bias_add_activation as cbaa + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_add_identity.config") +def conv2d_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import copy + + import cutlass_lib + + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + + op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"] + op.binary_op = cutlass_lib.library.EpilogueMathName["Plus"] + op.unary_op = cutlass_lib.library.EpilogueMathName["Identity"] + + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16) + + +@registry.reg("cuda.conv2d_bias_add_identity.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cbaa.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_add_identity.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cbaa.INSTANCE_TEMPLATE, + cbaa.EXEC_TEMPLATE, + cbaa.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_add_identity.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_add_identity.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + b = func_attrs["inputs"][2] + r = func_attrs["inputs"][3] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return cbaa.FUNC_CALL_TEMPLATE.render( + 
func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + bias_ptr=b._attrs["name"], + res_ptr=r._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.conv2d_bias_add_identity.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py new file mode 100644 index 000000000..10aa46619 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_hardswish.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias add hardswish codegen +""" +from ... import registry +from ...target import Target +from . 
import common, common_conv2d_bias_add_activation as cbaa + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_add_hardswish.config") +def conv2d_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import copy + + import cutlass_lib + + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + + op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"] + op.binary_op = cutlass_lib.library.EpilogueMathName["Add"] + op.unary_op = cutlass_lib.library.EpilogueMathName["HardSwish"] + + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16) + + +@registry.reg("cuda.conv2d_bias_add_hardswish.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cbaa.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_add_hardswish.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cbaa.INSTANCE_TEMPLATE, + cbaa.EXEC_TEMPLATE, + cbaa.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_add_hardswish.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_add_hardswish.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + b = func_attrs["inputs"][2] + r = func_attrs["inputs"][3] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return cbaa.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + bias_ptr=b._attrs["name"], + res_ptr=r._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.conv2d_bias_add_hardswish.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. 
+ + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py new file mode 100644 index 000000000..b6b96704f --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_add_relu.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias add relu codegen +""" +from ... import registry +from ...target import Target +from . import common, common_conv2d_bias_add_activation as cbaa + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_add_relu.config") +def conv2d_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import copy + + import cutlass_lib + + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + + op.activation_op = cutlass_lib.library.EpilogueMathName["Identity"] + op.binary_op = cutlass_lib.library.EpilogueMathName["Plus"] + op.unary_op = cutlass_lib.library.EpilogueMathName["ReLu"] + + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + func_attrs["op_instance"] = common.extract_config(func_attrs, fproc_f16) + + +@registry.reg("cuda.conv2d_bias_add_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cbaa.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_add_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cbaa.INSTANCE_TEMPLATE, + cbaa.EXEC_TEMPLATE, + cbaa.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_add_relu.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cbaa.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_add_relu.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + b = func_attrs["inputs"][2] + r = func_attrs["inputs"][3] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = 
y._attrs["shape"] + return cbaa.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + bias_ptr=b._attrs["name"], + res_ptr=r._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.conv2d_bias_add_relu.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py new file mode 100644 index 000000000..b8ddfa205 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_few_channels.py @@ -0,0 +1,211 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +specialize conv2d op with few channels(< 8) +""" +from collections import OrderedDict + +from ... import registry +from ...target import Target +from . 
import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + + +def apply_special_config(func_attrs, op): + import cutlass_lib + + x = func_attrs["inputs"][0] + in_ch = x._attrs["shape"][-1]._attrs["values"][0] + + if in_ch == 3: + # By default we don't use it since the perf is worse than pad4+fixchannel + op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FewChannels + op.A.alignment = 1 + op.B.alignment = 1 + op.tile_description.stages = 2 + elif in_ch in [2, 4, 8]: + op.iterator_algorithm = cutlass_lib.library.IteratorAlgorithm.FixedChannels + op.A.alignment = in_ch + op.B.alignment = in_ch + op.tile_description.stages = 3 + return op + + +def extract_config(func_attrs): + """extract epilogue for conv op + + Parameters + ---------- + func_attrs : Dict + [description] op attributes + + Returns + ------- + [type]: Dict + [description] + + Raises + ------ + NotImplementedError + [description] + """ + import copy + + import cutlass_lib + + def f_proc_op_special(op): + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.iterator_algorithm == cutlass_lib.library.IteratorAlgorithm.Optimized + and op.accumulator_type() == acc_type + ): + + op = copy.deepcopy(op) + # set epilogue + epilogue_name = func_attrs["epilogue"] + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epilogue_name] + op.element_epilogue = acc_type + op = apply_special_config(func_attrs, op) + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + op_kind = cutlass_lib.library.OperationKind.Conv2d + conv_kind = cutlass_lib.library.ConvKind.Fprop + ret = [] + conv2d_ops = OrderedDict() + extract_ops = list(Target.current()._operators[op_kind].items()) + + for _, value in extract_ops: + op = value[0] + if op.conv_kind == conv_kind: + ret = f_proc_op_special(op) + if len(ret) > 0: + for op_inst in ret: + key = common.kernel_name(op_inst) + conv2d_ops[key] = op_inst + return conv2d_ops + + +@registry.reg("cuda.conv2d_bias_few_channels.config") +def conv2d_config(func_attrs, dtype="float16"): + """extract configurations for profiling + + Parameters + ---------- + func_attrs : Dict + [description] op attributes + dtype : str, optional + [description] by default "float16" + + Returns + ------- + [type] + [description] + + Raises + ------ + NotImplementedError + [description] + """ + func_attrs["op_instance"] = extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_few_channels.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + """generate code for profiling""" + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_few_channels.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + """generating special conv2d kernel and all of its auxiliary functions + + Parameters + ---------- + func_attrs : Dict + [description] attributes of conv2d op + exec_cond_remplate : [type] + [description] + shape_eval_template : [type] + [description] + shape_save_template : [type] + [description] + + Returns + ------- + [type] + [description] + """ + return 
common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_few_channels.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_few_channels.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_few_channels.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py new file mode 100644 index 000000000..e31ad9095 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias hardswish codegen +""" +from ... import registry +from . import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_hardswish.config") +def conv2d_config(func_attrs, dtype="float16"): + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_hardswish.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_hardswish.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_hardswish.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_hardswish.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_hardswish.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py new file mode 100644 index 000000000..f305f3344 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_hardswish_few_channels.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +specialize conv2d op with few channels(< 8) +""" + +from ... import registry + +from . import common, common_conv2d_bias_activation as cba +from .common_conv2d_few_channels import extract_config + + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.config") +def conv2d_config(func_attrs, dtype="float16"): + """extract configurations for profiling + + Parameters + ---------- + func_attrs : Dict + [description] op attributes + dtype : str, optional + [description] by default "float16" + + Returns + ------- + [type] + [description] + + Raises + ------ + NotImplementedError + [description] + """ + func_attrs["op_instance"] = extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + """generate code for profiling""" + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + """generating special conv2d kernel and all of its auxiliary functions + + Parameters + ---------- + func_attrs : Dict + [description] attributes of conv2d op + exec_cond_remplate : [type] + [description] + shape_eval_template : [type] + [description] + shape_save_template : [type] + [description] + + Returns + ------- + [type] + [description] + """ + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_hardswish_few_channels.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py new file mode 100644 index 000000000..ea75bdd9d --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias relu codegen +""" +from ... import registry +from . import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_relu.config") +def conv2d_config(func_attrs, dtype="float16"): + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_relu.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_relu.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_relu.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py new file mode 100644 index 000000000..e207bc10a --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_relu_few_channels.py @@ -0,0 +1,115 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +specialize conv2d op with few channels(< 8) +""" + +from ... import registry +from . import common, common_conv2d_bias_activation as cba +from .common_conv2d_few_channels import extract_config + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.config") +def conv2d_config(func_attrs, dtype="float16"): + """extract configurations for profiling + + Parameters + ---------- + func_attrs : Dict + op attributes + dtype : str, optional + by default "float16" + + Returns + ------- + None + """ + func_attrs["op_instance"] = extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + """generate code for profiling""" + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + """generating special conv2d kernel and all of its auxiliary functions + + Parameters + ---------- + func_attrs : Dict + [description] attributes of conv2d op + exec_cond_remplate : [type] + [description] + shape_eval_template : [type] + [description] + shape_save_template : [type] + [description] + + Returns + ------- + [type] + [description] + """ + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_relu_few_channels.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py new file mode 100644 index 000000000..5ad4ccd6a --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/conv2d_bias_sigmoid.py @@ -0,0 +1,82 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +conv2d bias sigmoid codegen +""" + +from ... import registry +from . 
import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + + +@registry.reg("cuda.conv2d_bias_sigmoid.config") +def conv2d_config(func_attrs, dtype="float16"): + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.conv2d_bias_sigmoid.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + cba.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.conv2d_bias_sigmoid.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + cba.SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + ) + + +@registry.reg("cuda.conv2d_bias_sigmoid.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.conv2d_bias_sigmoid.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + return cba.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.conv2d_bias_sigmoid.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py new file mode 100644 index 000000000..b1b6acbc1 --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d.py @@ -0,0 +1,256 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +transposed conv2d op codegen +""" +import re + +import jinja2 + +from ... import registry +from . 
import common, conv2d + +# pylint: disable=C0103,C0415,W0613,C0301 + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" + +{{extra_header}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +{{instances_def}} + +void {{function_name}} ( + cutlass::half_t* in_ptr, + cutlass::half_t* weight_ptr, + cutlass::half_t* out_ptr, + uint8_t* workspace, + int64_t* batch, + int64_t* out_ch, + int64_t* in_ch, + int64_t* kernel_h, + int64_t* kernel_w, + int64_t* in_h, + int64_t* in_w, + int64_t* out_batch, + int64_t* out_h, + int64_t* out_w, + int stride, + int dilation, + int pad, + cudaStream_t stream + ) { + + {{shape_function}} + int i32_batch = *batch; + int i32_in_h = *in_h; + int i32_in_w = *in_w; + int i32_in_ch = *in_ch; + int i32_out_ch = *out_ch; + int i32_kernel_h = *kernel_h; + int i32_kernel_w = *kernel_w; + int i32_out_batch = *out_batch; + int i32_out_h = *out_h; + int i32_out_w = *out_w; + + using cutlass::layout::TensorNHWC; + TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch))); + TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch))); + TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch))); + + cutlass::conv::Conv2dProblemSize problem_size( + {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch}, + {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch}, + {pad, pad, pad, pad}, + {stride, stride}, + {dilation, dilation}, + {i32_batch, i32_in_h, i32_in_w, i32_in_ch}, + cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this conv2d specialization." 
+ ); +} +""" +) + + +def conv_transpose_instance(op_def): + tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad") + tmp = re.sub( + r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>", + "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>", + tmp, + ) + return tmp + + +def emit_instance(op, f_instance_convertor=conv_transpose_instance): + """Emits cutlass instance.""" + import cutlass_lib + + emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance() + op_def = emiter.emit(op) + op_def = f_instance_convertor(op_def) + return op_def + + +@registry.reg("cuda.transposed_conv2d.config") +def conv2d_config(func_attrs, dtype="float16"): + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.transposed_conv2d.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + conv2d.INSTANCE_TEMPLATE, + conv2d.EXEC_TEMPLATE, + SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + f_emit_instance=emit_instance, + ) + + +@registry.reg("cuda.transposed_conv2d.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return conv2d.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.transposed_conv2d.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return conv2d.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.transposed_conv2d.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + # shape func + shape_func = shape_template.render( + indent=" ", + dtype="int64_t ", + div="/", + x_dim0="batch", + x_dim1="in_h", + x_dim2="in_w", + x_dim3="in_ch", + w_dim0="out_ch", + w_dim1="kernel_h", + w_dim2="kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + ) + file_pairs = [] + for op_name, op in op_instance.items(): + config = emit_instance(op) + + config_name = common.extract_config_name(config) + name = "DeviceConvBwdInstance" + instance = conv2d.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = conv2d.EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, instance=name + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="conv", + dtype="cutlass::half_t", + shape_func="", + exec_paths=exec_program, + ) + code = conv2d.PROFILER_TEMPLATE.render( + op_func=op_func, shape_func=shape_func, name=name + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + 
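For reference, the Fprop-to-Dgrad rewrite performed by conv_transpose_instance above is a pure string transformation on the emitted CUTLASS instance: the forward-propagation kernel name is swapped for its data-gradient counterpart, and the GEMM identity threadblock swizzle is replaced by the strided-dgrad swizzle. Below is a minimal, standalone sketch of that transformation; the op_def literal is a made-up fragment for illustration only, not a real emitted instance.

    import re

    def convert_fprop_to_dgrad(op_def: str) -> str:
        # Mirror of conv_transpose_instance above: swap the kernel template ...
        tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad")
        # ... and substitute the threadblock swizzle used by strided dgrad.
        return re.sub(
            r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>",
            "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>",
            tmp,
        )

    # Hypothetical fragment of an emitted instance (illustration only).
    op_def = (
        "using Operation = cutlass::conv::kernel::DefaultConv2dFprop<"
        "..., cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, ...>;"
    )
    print(convert_fprop_to_dgrad(op_def))
    # Prints the same fragment with DefaultConv2dDgrad and
    # StridedDgradIdentityThreadblockSwizzle<1> substituted in.
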
+@registry.reg("cuda.transposed_conv2d.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py new file mode 100644 index 000000000..2df9642fa --- /dev/null +++ b/python/aitemplate/backend/cuda/conv2d/transposed_conv2d_bias.py @@ -0,0 +1,264 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +transposed conv2d + bias + (relu) codegen +""" +import re + +import jinja2 + +from ... import registry +from . import common, common_conv2d_bias_activation as cba + +# pylint: disable=C0103,C0415,W0613,C0301 + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" + +{{extra_header}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("Got cutlass error: ") + cutlassGetStatusString(error) + \\ + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +{{instances_def}} + +void {{function_name}} ( + cutlass::half_t* in_ptr, + cutlass::half_t* weight_ptr, + cutlass::half_t* out_ptr, + cutlass::half_t* bias_ptr, + uint8_t* workspace, + int64_t* batch, + int64_t* out_ch, + int64_t* in_ch, + int64_t* kernel_h, + int64_t* kernel_w, + int64_t* in_h, + int64_t* in_w, + int64_t* out_batch, + int64_t* out_h, + int64_t* out_w, + int stride, + int dilation, + int pad, + cudaStream_t stream + ) { + + {{shape_function}} + int i32_batch = *batch; + int i32_in_h = *in_h; + int i32_in_w = *in_w; + int i32_in_ch = *in_ch; + int i32_out_ch = *out_ch; + int i32_kernel_h = *kernel_h; + int i32_kernel_w = *kernel_w; + int i32_out_batch = *out_batch; + int i32_out_h = *out_h; + int i32_out_w = *out_w; + + using cutlass::layout::TensorNHWC; + TensorNHWC layout_A(TensorNHWC::packed(cutlass::make_Coord(i32_batch, i32_in_h, i32_in_w, i32_in_ch))); + TensorNHWC layout_B(TensorNHWC::packed(cutlass::make_Coord(i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch))); + TensorNHWC layout_C(TensorNHWC::packed(cutlass::make_Coord(i32_out_batch, i32_out_h, i32_out_w, i32_out_ch))); + + cutlass::conv::Conv2dProblemSize problem_size( + {i32_out_batch, i32_out_h, i32_out_w, i32_out_ch}, + {i32_out_ch, i32_kernel_h, i32_kernel_w, i32_in_ch}, + {pad, pad, pad, pad}, + {stride, stride}, + {dilation, dilation}, + 
{i32_batch, i32_in_h, i32_in_w, i32_in_ch}, + cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this conv2d specialization." + ); +} +""" +) + + +def _conv_transpose_instance(op_def): + tmp = op_def.replace("DefaultConv2dFprop", "DefaultConv2dDgrad") + tmp = re.sub( + r"cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<\d>", + "cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>", + tmp, + ) + return tmp + + +def emit_instance(op, f_instance_convertor=_conv_transpose_instance): + import cutlass_lib + + emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance() + op_def = emiter.emit(op) + op_def = f_instance_convertor(op_def) + return op_def + + +@registry.reg("cuda.transposed_conv2d_bias.config") +@registry.reg("cuda.transposed_conv2d_bias_relu.config") +def conv2d_config(func_attrs, dtype="float16"): + func_attrs["op_instance"] = common.extract_config(func_attrs) + + +@registry.reg("cuda.transposed_conv2d_bias.gen_function") +@registry.reg("cuda.transposed_conv2d_bias_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + shape_save_template, +): + return common.gen_function( + func_attrs, + cba.INSTANCE_TEMPLATE, + cba.EXEC_TEMPLATE, + SRC_TEMPLATE, + exec_cond_remplate, + shape_eval_template, + shape_save_template, + f_emit_instance=emit_instance, + ) + + +@registry.reg("cuda.transposed_conv2d_bias.func_decl") +@registry.reg("cuda.transposed_conv2d_bias_relu.func_decl") +def conv2d_gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return cba.FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.transposed_conv2d_bias.func_call") +@registry.reg("cuda.transposed_conv2d_bias_relu.func_call") +def conv2d_gen_function_call(func_attrs, indent=" "): + x = func_attrs["inputs"][0] + xshape = x._attrs["shape"] + w = func_attrs["inputs"][1] + b = func_attrs["inputs"][2] + wshape = w._attrs["shape"] + y = func_attrs["outputs"][0] + yshape = y._attrs["shape"] + return cba.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + in_ptr=x._attrs["name"], + weight_ptr=w._attrs["name"], + out_ptr=y._attrs["name"], + bias_ptr=b._attrs["name"], + p_batch="&" + xshape[0]._attrs["name"], + p_out_ch="&" + wshape[0]._attrs["name"], + p_in_ch="&" + xshape[3]._attrs["name"], + p_kernel_h="&" + wshape[1]._attrs["name"], + p_kernel_w="&" + wshape[2]._attrs["name"], + p_in_h="&" + xshape[1]._attrs["name"], + p_in_w="&" + xshape[2]._attrs["name"], + p_out_batch="&" + yshape[0]._attrs["name"], + p_out_h="&" + yshape[1]._attrs["name"], + p_out_w="&" + yshape[2]._attrs["name"], + stride=func_attrs["stride"], + dilation=func_attrs["dilate"], + pad=func_attrs["pad"], + indent=indent, + ) + + +@registry.reg("cuda.transposed_conv2d_bias.gen_profiler") +@registry.reg("cuda.transposed_conv2d_bias_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + # shape func + shape_func = shape_template.render( + indent=" ", + dtype="int64_t ", + div="/", + x_dim0="batch", + x_dim1="in_h", + x_dim2="in_w", + x_dim3="in_ch", + w_dim0="out_ch", + w_dim1="kernel_h", + w_dim2="kernel_w", + stride="stride", + dilate="dilation", + pad="pad", + ) + file_pairs = [] + for op_name, op in op_instance.items(): + config = emit_instance(op) + + config_name = common.extract_config_name(config) + name = "DeviceConvBwdInstance" + instance = cba.INSTANCE_TEMPLATE.render( + 
config_name=config_name, name=name, config=config + ) + exec_program = cba.EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, instance=name + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="conv", + dtype="cutlass::half_t", + shape_func="", + exec_paths=exec_program, + ) + code = cba.PROFILER_TEMPLATE.render( + op_func=op_func, shape_func=shape_func, name=name + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +@registry.reg("cuda.transposed_conv2d_bias.filter") +@registry.reg("cuda.transposed_conv2d_bias_relu.filter") +def conv2d_function_filter(cfg, func_attrs, x_shape): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + x_shape: + Input shapes. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, x_shape) diff --git a/python/aitemplate/backend/cuda/cuda_common.py b/python/aitemplate/backend/cuda/cuda_common.py new file mode 100644 index 000000000..20093b05c --- /dev/null +++ b/python/aitemplate/backend/cuda/cuda_common.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +CUDA common functions for codegen. +""" +from typing import Dict + +DTYPE_TO_CUDATYPE: Dict[str, str] = { + "float16": "half", + "float": "float", + "int64": "int64_t", +} + + +DTYPE_TO_CUTLASSTYPE: Dict[str, str] = { + "float16": "cutlass::half_t", + "float": "float", +} + + +def dtype_to_cuda_type(dtype: str): + """Returns the corresponding cuda type.""" + cuda_type = DTYPE_TO_CUDATYPE.get(dtype) + + if cuda_type is None: + raise NotImplementedError("CUDA - Unsupported dtype: {}".format(dtype)) + return cuda_type + + +def dtype_to_cutlass_type(dtype: str): + """Returns the corresponding cutlass type.""" + cutlass_type = DTYPE_TO_CUTLASSTYPE.get(dtype) + + if cutlass_type is None: + raise NotImplementedError("CUDA - Unsupported dtype: {}".format(dtype)) + return cutlass_type diff --git a/python/aitemplate/backend/cuda/elementwise/__init__.py b/python/aitemplate/backend/cuda/elementwise/__init__.py new file mode 100644 index 000000000..0bf6e473f --- /dev/null +++ b/python/aitemplate/backend/cuda/elementwise/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +(c) Meta Platforms, Inc. 
and affiliates. Confidential and proprietary. +""" +from . import fused_elementwise + +__all__ = ["fused_elementwise"] diff --git a/python/aitemplate/backend/cuda/elementwise/custom_math.cuh b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh new file mode 100644 index 000000000..2adddd531 --- /dev/null +++ b/python/aitemplate/backend/cuda/elementwise/custom_math.cuh @@ -0,0 +1,299 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef CUSTOM_MATH +#define CUSTOM_MATH + +#ifndef __HALF2_TO_UI +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#endif + +#ifndef __HALF_TO_US +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#endif + +template +__device__ T sign_custom(const T a) { + return T(a > T(0)) - T(a < T(0)); +} + +__device__ half2 h2sign_custom(const half2 a) { + return half2(sign_custom(a.x), sign_custom(a.y)); +} + +__device__ half2 fast_tanh(half2 x) { +#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \ + (__CUDA_ARCH__ >= 750) + + asm volatile("tanh.approx.f16x2 %0, %1;" + : "=r"(__HALF2_TO_UI(x)) + : "r"(__HALF2_TO_UI(x))); + return x; + +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif +} + +__device__ half fast_tanh(half x) { +#if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && \ + (__CUDA_ARCH__ >= 750) + + asm volatile("tanh.approx.f16 %0, %1;" + : "=h"(__HALF_TO_US(x)) + : "h"(__HALF_TO_US(x))); + return x; + +#else + return half(cutlass::fast_tanh(float(x))); +#endif +} + +// Return 1 +__device__ half one() { + uint16_t bits = 0x3c00u; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for half_t) +__device__ half constant_half() { + uint16_t bits = 0x3800u; + return reinterpret_cast(bits); +} + +__device__ float fsigmoid_custom(const float a) { + return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f; +} + +__device__ half hsigmoid_custom(const half a) { + half half_val = constant_half(); + half one_val = one(); + return __hmul((__hadd(fast_tanh(__hmul(a, half_val)), one_val)), half_val); +} + +__device__ half2 h2sigmoid_custom(const half2 a) { + half2 halfX2 = half2(constant_half(), constant_half()); + half2 oneX2 = half2(one(), one()); + return __hmul2((__hadd2(fast_tanh(__hmul2(a, halfX2)), oneX2)), halfX2); +} + +__device__ float fsilu(const float a) { + return a * fsigmoid_custom(a); +} + +__device__ half hsilu(const half a) { + return __hmul(a, hsigmoid_custom(a)); +} + +__device__ half2 h2silu(const half2 a) { + return __hmul2(a, h2sigmoid_custom(a)); +} + +__device__ float leaky_relu(const float a, const float negativeSlope) { + return a > 0.f ? a : a * negativeSlope; +} + +__device__ half leaky_relu(const half a, const half negativeSlope) { + return a > half(0.f) ? a : __hmul(a, negativeSlope); +} + +__device__ half2 leaky_relu(const half2 a, const half2 negativeSlope) { + return half2( + leaky_relu(a.x, negativeSlope.x), leaky_relu(a.y, negativeSlope.y)); +} + +__device__ float relu(const float a) { + return a > 0.f ? 
a : 0.f; +} + +__device__ half relu(const half a) { + return a > half(0.f) ? a : half(0.f); +} + +__device__ half2 relu(const half2 a) { + half2 zeroX2 = half2(half(0.f), half(0.f)); +#if __CUDA_ARCH__ >= 800 + return __hmax2(a, zeroX2); +#else + return half2(relu(a.x), relu(a.y)); +#endif +} + +template +__device__ T hard_tanh(const T a, T min_val, T max_val) { + if (a <= min_val) { + return min_val; + } else if (a >= max_val) { + return max_val; + } else { + return a; + } +} + +__device__ half2 +h2hard_tanh(const half2 a, const half2 min_val, const half2 max_val) { + return half2( + hard_tanh(a.x, min_val.x, max_val.x), + hard_tanh(a.y, min_val.y, max_val.y)); +} + +__device__ half replace_if_inf( + const half a, + const half inf_replace, + const half neginf_replace) { + auto is_inf = __hisinf(a); + if (is_inf == -1) { + return neginf_replace; + } + if (is_inf == 1) { + return inf_replace; + } + return a; +} + +__device__ float replace_if_inf( + const float a, + const float inf_replace, + const float neginf_replace) { + auto is_inf = isinf(a); + if (is_inf == -1) { + return neginf_replace; + } + if (is_inf == 1) { + return inf_replace; + } + return a; +} + +__device__ half2 nan_to_num( + const half2 a, + const half2 nan_replace, + const half2 inf_replace, + const half2 neginf_replace) { + half2 isnan = __hisnan2(a); + return half2( + isnan.x ? nan_replace.x + : replace_if_inf(a.x, inf_replace.x, neginf_replace.x), + isnan.y ? nan_replace.y + : replace_if_inf(a.y, inf_replace.y, neginf_replace.y)); +} + +__device__ half nan_to_num( + const half a, + const half nan_replace, + const half inf_replace, + const half neginf_replace) { + if (__hisnan(a)) { + return nan_replace; + } + return replace_if_inf(a, inf_replace, neginf_replace); +} + +__device__ float nan_to_num( + const float a, + const float nan_replace, + const float inf_replace, + const float neginf_replace) { + if (isnan(a)) { + return nan_replace; + } + return replace_if_inf(a, inf_replace, neginf_replace); +} + +__device__ half2 clamp_nan_to_num( + const half2 a, + const half2 clamp_min, + const half2 clamp_max, + const half2 nan_replace) { + half2 isnan = __hisnan2(a); + return half2( + isnan.x ? nan_replace.x : hard_tanh(a.x, clamp_min.x, clamp_max.x), + isnan.y ? nan_replace.y : hard_tanh(a.y, clamp_min.y, clamp_max.y)); +} + +__device__ half clamp_nan_to_num( + const half a, + const half clamp_min, + const half clamp_max, + const half nan_replace) { + return __hisnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max); +} + +__device__ float clamp_nan_to_num( + const float a, + const float clamp_min, + const float clamp_max, + const float nan_replace) { + return isnan(a) ? nan_replace : hard_tanh(a, clamp_min, clamp_max); +} + +// Backup functions for CUDA_ARCH < 800 +__device__ half nanh() { + return __float2half(nanf("")); +} + +__device__ bool half_isnan(half h) { + return h != h; +} + +__device__ half hmin(half a, half b) { + return (a < b) ? a : b; +} + +__device__ half hmax(half a, half b) { + return (a > b) ? a : b; +} + +// max/min functions that let NaNs pass through +__device__ float fmaxf_nan(const float a, const float b) { + return (isnan(a) || isnan(b)) ? nanf("") : fmaxf(a, b); +} + +__device__ half hmax_nan(const half a, const half b) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hmax_nan(a, b); +#else + return (half_isnan(a) || half_isnan(b)) ? 
nanh() : hmax(a, b); +#endif +} + +__device__ half2 hmax2_nan(const half2 a, const half2 b) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hmax2_nan(a, b); +#else + return half2(hmax_nan(a.x, b.x), hmax_nan(a.y, b.y)); +#endif +} + +__device__ float fminf_nan(const float a, const float b) { + return (isnan(a) || isnan(b)) ? nanf("") : fminf(a, b); +} + +__device__ half hmin_nan(const half a, const half b) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hmin_nan(a, b); +#else + return (half_isnan(a) || half_isnan(b)) ? nanh() : hmin(a, b); +#endif +} + +__device__ half2 hmin2_nan(const half2 a, const half2 b) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hmin2_nan(a, b); +#else + return half2(hmin_nan(a.x, b.x), hmin_nan(a.y, b.y)); +#endif +} + +#endif diff --git a/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py new file mode 100644 index 000000000..f25013aec --- /dev/null +++ b/python/aitemplate/backend/cuda/elementwise/fused_elementwise.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Elementwise codegen for CUDA. +""" + +import os +from typing import Any, Dict + +from ... import registry +from ...backend_spec import CUDASpec +from ...common import elementwise_common +from ...target import Target + +HEAD_TEMPLATE = """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/constants.h" +""" + + +@registry.reg("cuda.fused_elementwise.gen_function") +def fused_elementwise_gen_function(func_attrs: Dict[str, Any]) -> str: + """Generates fused_elementwise function definition.""" + custom_libs = Target.current().get_custom_libs( + os.path.dirname(__file__), "custom_math.cuh" + ) + return elementwise_common.fused_elementwise_gen_function( + func_attrs=func_attrs, + custom_libs=custom_libs, + head_template=HEAD_TEMPLATE, + backend_spec=CUDASpec(), + ) + + +@registry.reg("cuda.fused_elementwise.func_decl") +def fused_elementwise_gen_function_decl(func_attrs): + """Generates fused_elementwise function declaration.""" + return elementwise_common.fused_elementwise_gen_function_decl( + func_attrs=func_attrs, + backend_spec=CUDASpec(), + ) + + +@registry.reg("cuda.fused_elementwise.func_call") +def fused_elementwise_gen_function_call(func_attrs, indent): + """Generates fused_elementwise function call.""" + return elementwise_common.fused_elementwise_gen_function_call( + func_attrs=func_attrs, + indent=indent, + backend_spec=CUDASpec(), + ) diff --git a/python/aitemplate/backend/cuda/embedding/__init__.py b/python/aitemplate/backend/cuda/embedding/__init__.py new file mode 100644 index 000000000..3e3aab46b --- /dev/null +++ b/python/aitemplate/backend/cuda/embedding/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from .bert_embeddings import * diff --git a/python/aitemplate/backend/cuda/embedding/bert_embeddings.py b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py new file mode 100644 index 000000000..2ca8d5816 --- /dev/null +++ b/python/aitemplate/backend/cuda/embedding/bert_embeddings.py @@ -0,0 +1,450 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +bert_embeddings kernel codegen for CUDA. +""" + +from typing import Any, Dict + +import jinja2 + +from ... import registry + +# pylint: disable=C0301 + +FUNC_TEMPLATE = jinja2.Template( + """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" + +#define FINAL_MASK 0xffffffff + +namespace { + +template +__inline__ __device__ T warpReduceSum(T* val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + val[0] += __shfl_xor_sync(FINAL_MASK, val[0], mask, 32); + } + return (T)(0.0f); +} + +template +__inline__ __device__ T blockReduceSum(T* val) { + __shared__ T shared[33]; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + + warpReduceSum(val); + + if (lane == 0) { +#pragma unroll + shared[wid] = val[0]; + } + + __syncthreads(); + + // blockDim.x is round up to multiples of 32 + bool is_mask = threadIdx.x < (blockDim.x / 32); +#pragma unroll + val[0] = is_mask ? 
shared[lane] : (T)(0.0f); + + warpReduceSum(val); + return (T)0.0f; +} + +template +__inline__ __device__ T normalize(T val, T mean, T variance, T gamma, T beta) { + return (val - mean) * variance * gamma + beta; +} + +// __inline__ __device__ float sigmoid(float val) { +// return 1.0f / (1.0f + expf(-1.0f * val)); +// } + +// fast sigmoid +__inline__ __device__ float sigmoid(float val) { + return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f; +} + +template +__global__ void bert_embeddings_kernel( + uint4* output, + INDEX_T* input_ids, + INDEX_T* token_type_ids, + INDEX_T* position_ids, + uint4* word_embeddings, + uint4* token_type_embeddings, + uint4* position_embeddings, + uint4* gamma, + uint4* beta, + const int64_t embedding_dim, + const int64_t vocab_size, + const int64_t type_vocab_size, + const int64_t max_position_embeddings, + const float eps) { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int embedding_dim_div_8 = embedding_dim / 8; + + const int64_t input_id = input_ids[bid]; + const int64_t token_type_id = token_type_ids[bid]; + const int64_t position_id = position_ids[bid]; + + // index bound check + if (input_id < 0 || input_id >= vocab_size || token_type_id < 0 || + token_type_id >= type_vocab_size || position_id < 0 || + position_id >= max_position_embeddings) { + return; + } + + word_embeddings = word_embeddings + input_id * embedding_dim_div_8; + token_type_embeddings = + token_type_embeddings + token_type_id * embedding_dim_div_8; + position_embeddings = position_embeddings + position_id * embedding_dim_div_8; + + uint4 word_embedding{0, 0, 0, 0}; + uint4 token_type_embedding{0, 0, 0, 0}; + uint4 position_embedding{0, 0, 0, 0}; + + if (tid < embedding_dim_div_8) { + word_embedding = word_embeddings[tid]; + token_type_embedding = token_type_embeddings[tid]; + position_embedding = position_embeddings[tid]; + } + uint4 embedding{0, 0, 0, 0}; + + half* word_emb_vec = reinterpret_cast(&word_embedding); + half* token_emb_vec = reinterpret_cast(&token_type_embedding); + half* pos_emb_vec = reinterpret_cast(&position_embedding); + + half* emb_vec = reinterpret_cast(&embedding); + + // layernorm + __shared__ float s_mean, s_variance; + float local_sums[1] = {0.0f}; + +#pragma unroll + for (int i = 0; i < 8; i++) { + float sum = word_emb_vec[i] + token_emb_vec[i] + pos_emb_vec[i]; + local_sums[0] += sum; + emb_vec[i] = (half)sum; + } + + if (blockDim.x <= 32) { + warpReduceSum(local_sums); + } else { + blockReduceSum(local_sums); + } + if (threadIdx.x == 0) { + s_mean = local_sums[0] / embedding_dim; + } + __syncthreads(); + + local_sums[0] = 0.0f; + + if (tid < embedding_dim_div_8) { +#pragma unroll + for (int i = 0; i < 8; i++) { + float val = emb_vec[i]; + local_sums[0] += (val - s_mean) * (val - s_mean); + } + } + + if (blockDim.x <= 32) { + warpReduceSum(local_sums); + } else { + blockReduceSum(local_sums); + } + if (threadIdx.x == 0) { + s_variance = rsqrtf(local_sums[0] / embedding_dim + eps); + } + __syncthreads(); + + if (tid < embedding_dim_div_8) { + uint4 local_gamma = gamma[tid]; + half* gamma_vec = reinterpret_cast(&local_gamma); + uint4 local_beta = beta[tid]; + half* beta_vec = reinterpret_cast(&local_beta); +#pragma unroll + for (int i = 0; i < 8; i++) { + emb_vec[i] = normalize( + (float)emb_vec[i], + s_mean, + s_variance, + (float)gamma_vec[i], + (float)beta_vec[i]); + } + } + + // write to output + if (tid < embedding_dim_div_8) { + output = output + bid * embedding_dim_div_8; + output[tid] = embedding; + } +} + +template +void 
bert_embeddings_launcher( + half* output, + INDEX_T* input_ids, + INDEX_T* token_type_ids, + INDEX_T* position_ids, + half* word_embeddings, + half* token_type_embeddings, + half* position_embeddings, + half* gamma, + half* beta, + const int64_t indices_num, + const int64_t embedding_dim, + const int64_t vocab_size, + const int64_t type_vocab_size, + const int64_t max_position_embeddings, + const float eps, + cudaStream_t stream) { + if (embedding_dim % 8 != 0) { + throw std::runtime_error("embedding dim must be multiple of 8"); + } + dim3 grid(indices_num); + + // round up to multiple of 32 + int64_t num_threads = embedding_dim / 8; + num_threads = (num_threads + 31) / 32 * 32; + dim3 block(num_threads); + + bert_embeddings_kernel<<>>( + reinterpret_cast(output), + input_ids, + token_type_ids, + position_ids, + reinterpret_cast(word_embeddings), + reinterpret_cast(token_type_embeddings), + reinterpret_cast(position_embeddings), + reinterpret_cast(gamma), + reinterpret_cast(beta), + embedding_dim, + vocab_size, + type_vocab_size, + max_position_embeddings, + eps); +} + +} // namespace + +{{func_signature}} +{ + bert_embeddings_launcher<{{index_type}}>( + output, + input_ids, + token_type_ids, + position_ids, + word_embeddings, + token_type_embeddings, + position_embeddings, + gamma, + beta, + indices_num, + embedding_dim, + vocab_size, + type_vocab_size, + max_position_embeddings, + eps, + stream + ); +} + +""" +) + +FUNC_SIGNATURE = jinja2.Template( + """ +void {{func_name}}(half* output, + {{index_type}}* input_ids, + {{index_type}}* token_type_ids, + {{index_type}}* position_ids, + half* word_embeddings, + half* token_type_embeddings, + half* position_embeddings, + half* gamma, + half* beta, + const int64_t indices_num, + const int64_t embedding_dim, + const int64_t vocab_size, + const int64_t type_vocab_size, + const int64_t max_position_embeddings, + const float eps, + cudaStream_t stream) + """ +) + +FUNC_DECL = jinja2.Template( + """ + {{func_signature}}; + """ +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{indent}} {{calculate_indices_num}} +{{indent}} {{func_name}}( +{{indent}} {{output}}, +{{indent}} {{input_ids}}, +{{indent}} {{token_type_ids}}, +{{indent}} {{position_ids}}, +{{indent}} {{word_embeddings}}, +{{indent}} {{token_type_embeddings}}, +{{indent}} {{position_embeddings}}, +{{indent}} {{gamma}}, +{{indent}} {{beta}}, +{{indent}} {{indices_num}}, +{{indent}} {{embedding_dim}}, +{{indent}} {{vocab_size}}, +{{indent}} {{type_vocab_size}}, +{{indent}} {{max_position_embeddings}}, +{{indent}} {{eps}}, +{{indent}} stream /* default stream */ +{{indent}} ); + +{{indent}}} + """ +) + +INDICES_NUM_TEMPLATE = jinja2.Template( + """ + int64_t indices_num = 1; + {% for dim_name in dim_names %} + indices_num *= {{dim_name}}; + {% endfor %} + """ +) + + +def python_int_dtype_to_c_dtype(dtype): + if dtype == "int64": + return "int64_t" + if dtype in ["int", "int32"]: + return "int32_t" + return dtype + + +@registry.reg("cuda.bert_embeddings.gen_function") +def bert_embeddings_gen_function(func_attrs: Dict[str, Any]) -> str: + dtype = python_int_dtype_to_c_dtype(func_attrs["inputs"][0]._attrs["dtype"]) + return FUNC_TEMPLATE.render( + index_type=dtype, + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + index_type=dtype, + ).strip(), + ) + + +@registry.reg("cuda.bert_embeddings.func_decl") +def bert_embeddings_gen_function_decl(func_attrs: Dict[str, Any]) -> str: + dtype = python_int_dtype_to_c_dtype(func_attrs["inputs"][0]._attrs["dtype"]) + 
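+    # The declaration must use the same C index type as the generated kernel
+    # definition: the dtype of the first input (input_ids) is mapped by
+    # python_int_dtype_to_c_dtype ("int64" -> "int64_t", "int"/"int32" ->
+    # "int32_t") so that FUNC_SIGNATURE renders identically in both places.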
return FUNC_DECL.render( + func_signature=FUNC_SIGNATURE.render( + func_name=func_attrs["name"], + index_type=dtype, + ).strip() + ) + + +FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template( + "reinterpret_cast(&({{name}}->raw()))" +) + +FUNC_CALL_INT64_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") +FUNC_CALL_INT32_PARAM_TEMPLATE = jinja2.Template("reinterpret_cast({{name}})") + + +def get_int_param_template(tensor): + name = tensor._attrs["name"] + dtype = tensor._attrs["dtype"] + if dtype == "int64": + return FUNC_CALL_INT64_PARAM_TEMPLATE.render(name=name) + elif dtype in ("int", "int32"): + return FUNC_CALL_INT32_PARAM_TEMPLATE.render(name=name) + else: + raise NotImplementedError(f"Unsupported dtype: {dtype}") + + +@registry.reg("cuda.bert_embeddings.func_call") +def bert_embeddings_gen_function_call(func_attrs: Dict[str, Any], indent=" ") -> str: + ( + input_ids, + token_type_ids, + position_ids, + word_embeddings, + token_type_embeddings, + position_embeddings, + gamma, + beta, + ) = func_attrs["inputs"] + + indices_dims = [shape._attrs["name"] for shape in input_ids.shape()] + indices_num_str = INDICES_NUM_TEMPLATE.render( + dim_names=indices_dims, + ) + embedding_dim = word_embeddings._size(-1).value() + vocab_size = word_embeddings._size(0).value() + type_vocab_size = token_type_embeddings._size(0).value() + max_position_embeddings = position_embeddings._size(0).value() + + eps = func_attrs["eps"] + output_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=func_attrs["outputs"][0]._attrs["name"] + ) + + input_ids_str = get_int_param_template(input_ids) + token_type_ids_str = get_int_param_template(token_type_ids) + position_ids_str = get_int_param_template(position_ids) + + word_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=word_embeddings._attrs["name"] + ) + token_type_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=token_type_embeddings._attrs["name"] + ) + position_embeddings_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render( + name=position_embeddings._attrs["name"] + ) + + gamma_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=gamma._attrs["name"]) + beta_str = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=beta._attrs["name"]) + + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + calculate_indices_num=indices_num_str, + output=output_str, + input_ids=input_ids_str, + token_type_ids=token_type_ids_str, + position_ids=position_ids_str, + word_embeddings=word_embeddings_str, + token_type_embeddings=token_type_embeddings_str, + position_embeddings=position_embeddings_str, + gamma=gamma_str, + beta=beta_str, + indices_num="indices_num", + embedding_dim=embedding_dim, + vocab_size=vocab_size, + type_vocab_size=type_vocab_size, + max_position_embeddings=max_position_embeddings, + eps=eps, + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py new file mode 100644 index 000000000..604984059 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from . import bmm_rcr_softmax, gemm_rcr_bias_softmax, gemm_rcr_softmax + +__all__ = ["bmm_rcr_softmax", "gemm_rcr_bias_softmax", "gemm_rcr_softmax"] diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py new file mode 100644 index 000000000..4a63ff1fc --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_common_softmax.py @@ -0,0 +1,256 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common functions and templates for bmm-family ops +""" +import jinja2 + +from ...common import gemm_common +from ..gemm_universal import common + +from . import common_softmax + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, +{% if has_bias %} + cutlass::half_t*, +{% endif %} + cutlass::half_t*, + cutlass::half_t*, + float*, + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{% if has_bias %} +{{indent}} {{bias_ptr}}, +{% endif %} +{{indent}} {{c_ptr}}, +{{indent}} {{d_ptr}}, +{{indent}} {{n_ptr}}, +{{indent}} {{soft_ptr}}, +{{indent}} global_workspace, +{{indent}} {{a_dim0_ptr}}, +{{indent}} {{a_dim1_ptr}}, +{{indent}} {{a_dim2_ptr}}, +{{indent}} {{b_dim0_ptr}}, +{{indent}} {{b_dim1_ptr}}, +{{indent}} {{b_dim2_ptr}}, +{{indent}} {{c_dim0_ptr}}, +{{indent}} {{c_dim1_ptr}}, +{{indent}} {{c_dim2_ptr}}, +{{indent}} stream +{{indent}}); +""" +) + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + // cast to int64_t to avoid overflow + int64_t a_ptr_sz = static_cast(a_dim0) * static_cast(a_dim1) * static_cast(a_dim2); + int64_t b_ptr_sz = static_cast(b_dim0) * static_cast(b_dim1) * static_cast(b_dim2); + int64_t c_ptr_sz = static_cast(c_dim0) * static_cast(c_dim1) * static_cast(c_dim2); + int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz}); + // TODO: special pool size for A100 L2 cache 40M + // need to tune it for other devices + int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 25) / ptr_max_sz))); + + + memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz); // a_ptr: index 0 + memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz); // 
b_ptr: index 1 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // c_ptr: index 2 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // d_ptr: index 3 + memory_pool->AllocateFloatTensor(c_dim0 * c_dim1, mem_pool_sz); // n_ptr: index 4 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // soft_ptr: index 5 +""" +) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + args_parser_template, + emit_kernel=False, + bias_ptr_arg=None, +): + """Generate code for profiling""" + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + assert not (has_d and has_bias) + for op_name, op in op_instance.items(): + config = common_softmax.emit_instance(op, emit_kernel=emit_kernel) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = common_softmax.EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + problem_args=problem_args_template.render(), + ) + op_func = src_template.render( + custom_libs=common_softmax.gen_custom_libs(), + instances=instance, + function_name="bmm", + input_ndims=3, + weight_ndims=3, + shape_eval=shape_func, + exec_paths=exec_program, + has_d=has_d, + ) + func_call = FUNC_CALL_TEMPLATE.render( + func_name="bmm", + a_ptr="memory_pool->RequestTensorByIdx(0)", + b_ptr="memory_pool->RequestTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestTensorByIdx(2)", + d_ptr="memory_pool->RequestTensorByIdx(3)", + n_ptr="memory_pool->RequestTensorByIdx(4)", + soft_ptr="memory_pool->RequestTensorByIdx(5)", + has_d=has_d, + a_dim0_ptr="&a_dim0", + a_dim1_ptr="&a_dim1", + a_dim2_ptr="&a_dim2", + b_dim0_ptr="&b_dim0", + b_dim1_ptr="&b_dim1", + b_dim2_ptr="&b_dim2", + c_dim0_ptr="&c_dim0", + c_dim1_ptr="&c_dim1", + c_dim2_ptr="&c_dim2", + ) + code = common_softmax.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parser_template.render(), + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render( + name=name, has_d=has_d, has_bias=has_bias + ), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def gen_function_decl(func_attrs): + """Rendering argument to function declaration template""" + func_name = func_attrs["name"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + return FUNC_DECL_TEMPLATE.render(func_name=func_name, ndims=3, has_d=has_d) + + +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + problem_args, +): + """Generate the code for main function""" + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_softmax.gen_function( + func_attrs, + common_softmax.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + dim_info_dict=dim_info_dict, + emit_kernel=True, + ) + + +def gen_function_call(func_attrs, indent=" ", bias_ptr_arg=None): + """Rendering the code to function call template""" + + a = func_attrs["inputs"][0] + ashape = 
func_attrs["input_accessors"][0].original_shapes + b = func_attrs["inputs"][1] + bshape = func_attrs["input_accessors"][1].original_shapes + + c = func_attrs["inputs"][2] + d = func_attrs["inputs"][3] + n = func_attrs["inputs"][4] + + soft = func_attrs["outputs"][0] + cshape = func_attrs["output_accessors"][0].original_shapes + has_d = False + has_bias = bias_ptr_arg is not None + assert not (has_d and has_bias) + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr=c._attrs["name"], + d_ptr=d._attrs["name"], + n_ptr=n._attrs["name"], + soft_ptr=soft._attrs["name"], + has_d=has_d, + a_dim0_ptr="&" + ashape[0]._attrs["name"], + a_dim1_ptr="&" + ashape[1]._attrs["name"], + a_dim2_ptr="&" + ashape[2]._attrs["name"], + b_dim0_ptr="&" + bshape[0]._attrs["name"], + b_dim1_ptr="&" + bshape[1]._attrs["name"], + b_dim2_ptr="&" + bshape[2]._attrs["name"], + c_dim0_ptr="&" + cshape[0]._attrs["name"], + c_dim1_ptr="&" + cshape[1]._attrs["name"], + c_dim2_ptr="&" + cshape[2]._attrs["name"], + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py new file mode 100644 index 000000000..751a19a84 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/bmm_rcr_softmax.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor] +This is special in template based gemm solution +This is used for `torch.nn.functional.linear` +When use for `linear`, need set A->Data, B->Weight +""" +import jinja2 + +from ... import registry +from ..gemm_universal import common +from ..gemm_universal.layout import RCR +from . import bmm_common_softmax as bmm_common, common_softmax + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t B = std::atoi(argv[1]); + int64_t M = std::atoi(argv[2]); + int64_t N = std::atoi(argv[3]); + int64_t K = std::atoi(argv[4]); + + int64_t a_dim0 = B; + int64_t a_dim1 = M; + int64_t a_dim2 = K; + int64_t b_dim0 = B; + int64_t b_dim1 = N; + int64_t b_dim2 = K; + int64_t c_dim0 = B; + int64_t c_dim1 = M; + int64_t c_dim2 = N; +""" +) + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + /* + A: B*M*K (RowMajor) + B: B*N*K (ColumnMajor) + C/D/sofmax: B*M*N (RowMajor) + N: B*M*1 (RowMajor) + */ + + {M, N, K}, + B, + {a_ptr, LayoutA(K)}, + {b_ptr, LayoutB(K)}, + {c_ptr, LayoutC(N)}, + {d_ptr, LayoutC(N)}, + { + float(1.0), + float(0.0) + }, + {n_ptr, LayoutC(1)}, + {soft_ptr, LayoutC(N)}, + M*K, + N*K, + M*N, + M*N, + M*N, + M*N + + +""" +) + + +@registry.reg("cuda.bmm_rcr_softmax.config") +def bmm_rcr_softmax_config(func_attrs, dtype="float16"): + """This function sets a callback for processing the epilogue of the kernel + associated with func_attrs. 
+ + Parameters + ---------- + func_attrs: Dictionary + kernel attributes dictionary + layout: layout object + kernel layout + Returns + ------- + None + """ + common.make_fproc_f16(func_attrs, RCR) + + +@registry.reg("cuda.bmm_rcr_softmax.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + """Generate code for profiling""" + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_softmax.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + ARGS_PARSER_TEMPLATE, + emit_kernel=True, + ) + + +@registry.reg("cuda.bmm_rcr_softmax.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + """Generate the code for main function""" + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE.render(), + ) + + +@registry.reg("cuda.bmm_rcr_softmax.func_decl") +def gen_function_decl(func_attrs): + """Rendering argument to function declaration template""" + func_name = func_attrs["name"] + return bmm_common.FUNC_DECL_TEMPLATE.render(func_name=func_name, ndims=3) + + +@registry.reg("cuda.bmm_rcr_softmax.func_call") +def gen_function_call(func_attrs, indent=" "): + """Rendering the code to function call template""" + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rcr_softmax.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py new file mode 100644 index 000000000..ff5e4b084 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/common_softmax.py @@ -0,0 +1,538 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common template for softmax. 
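
Each @registry.reg("cuda.bmm_rcr_softmax.<stage>") decorator above registers one backend stage (config, gen_profiler, gen_function, func_decl, func_call, filter) under a string key that the compiler looks up when it lowers the op. A minimal sketch of that dispatch pattern, using a hypothetical reg/get pair rather than AITemplate's real registry module:

    # Hypothetical string-keyed registry; shows the dispatch pattern only.
    _FUNCS = {}

    def reg(key):
        def _wrap(fn):
            assert key not in _FUNCS, f"duplicate backend key: {key}"
            _FUNCS[key] = fn
            return fn
        return _wrap

    def get(key):
        return _FUNCS[key]

    @reg("cuda.bmm_rcr_softmax.func_decl")
    def _func_decl(func_attrs):
        return f"void {func_attrs['name']}(...);"

    # The compiler resolves backend code generators per op and stage by key:
    print(get("cuda.bmm_rcr_softmax.func_decl")({"name": "bmm_rcr_softmax_0"}))
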
+""" +import os +import re +from hashlib import sha1 + +import jinja2 + +from ...common import gemm_common +from ...target import Target +from ..gemm_universal import common + +# pylint: disable=C0301,C0415,R1705 + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/device_memory.h" + +#include "gemm_with_softmax.h" + +{{custom_libs}} + +{{extra_code}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +using LayoutA = cutlass::layout::RowMajor; +using LayoutB = cutlass::layout::ColumnMajor; +using LayoutC = cutlass::layout::RowMajor; + + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, +{% if has_d %} + cutlass::half_t* d_ptr, +{% endif %} + cutlass::half_t* c_ptr, + cutlass::half_t* d_ptr, + float* n_ptr, + cutlass::half_t* soft_ptr, + uint8_t* workspace, +{% if support_split_k %} + int split_k, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t* a_dim{{idx}}, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t* b_dim{{idx}}, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t* c_dim{{idx}}, +{% endfor %} + cudaStream_t stream + ) { + {{shape_eval}} + {{output_addr_calculator}} + {{extra_shape}} + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this gemm specialization." 
+ ); +} +""", + trim_blocks=True, + lstrip_blocks=True, +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}typename {{instance}}::Arguments arguments{ + +{{problem_args}} + +{{indent}}}; +{{indent}}{{instance}} gemm_op; +{% if is_profiler %} +{{indent}}size_t workspace_size = 0; //gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} + +{{indent}}auto status = gemm_op.initialize(arguments); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}return; + +""" +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + float*, + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{% if has_bias %} +{{indent}} {{bias_ptr}}, +{% endif %} +{{indent}} {{c_ptr}}, +{{indent}} {{d_ptr}}, +{{indent}} {{n_ptr}}, +{{indent}} {{soft_ptr}}, +{{indent}} global_workspace, +{{indent}} {{split_k}}, +{% for dim in adims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in bdims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in cdims %} +{{indent}} {{dim}}, +{% endfor %} +{{indent}} stream +{{indent}}); +""" +) + + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + // cast to int64_t to avoid overflow + int64_t a_ptr_sz = static_cast(a_dim0) * static_cast(a_dim1); + int64_t b_ptr_sz = static_cast(b_dim0) * static_cast(b_dim1); + int64_t c_ptr_sz = static_cast(c_dim0) * static_cast(c_dim1); + int64_t ptr_max_sz = std::max({a_ptr_sz, b_ptr_sz, c_ptr_sz}); + // TODO: special pool size for A100 L2 cache 40M + // need to tune it for other devices + int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 25) / ptr_max_sz))); + + memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz); // a_ptr: index 0 + memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz); // b_ptr: index 1 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // c_ptr: index 2 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // d_ptr: index 3 + memory_pool->AllocateFloatTensor(c_dim0, mem_pool_sz); // n_ptr: index 4 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // soft_ptr: index 5 +""" +) + + +DEFAULT_EXTRA_SHAPE_TEMPLATE = jinja2.Template( + """ +{{indent}}const int M = AM; +{{indent}}const int N = BN; +{{indent}}const int K = AK; +""" +) + + +# TODO Merge all alignment into single profiler +PROFILER_TEMPLATE = jinja2.Template( + """ +size_t GLOBAL_WORKSPACE_SIZE = 0; + +{{op_func}} + +struct ProfilerMemoryPool { + ProfilerMemoryPool() { + std::random_device rd; + gen = std::mt19937(rd()); + uniform_dist = std::uniform_int_distribution(1, 48964896); + offsets.reserve(512); + strides.reserve(512); + copies.reserve(512); + ptrs.reserve(512); + blobs.reserve(512); + } + ~ProfilerMemoryPool() {} + + template + DType* AllocateGaussianTensor(int64_t size) { + size_t length = size * sizeof(DType); + blobs.emplace_back(length); + DType* ptr = reinterpret_cast(blobs.back().get()); + + uint64_t seed = uniform_dist(gen); + double mean = 0.f; 
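
TENSOR_DECL_TEMPLATE sizes the profiler's rotating tensor pool with a small clamp expression: a fixed budget of 1 << 25 elements is divided by the largest operand's element count and the result is clamped to between 2 and 64 copies, presumably so the timing loop rotates through several buffers instead of re-reading one L2-resident tensor (the TODO notes the constant is tuned around A100's 40 MB L2). A sketch of that arithmetic:

    # Mirrors mem_pool_sz in TENSOR_DECL_TEMPLATE:
    # max(2, min(64, (1 << 25) // largest_operand_element_count))
    def mem_pool_copies(a_elems: int, b_elems: int, c_elems: int) -> int:
        largest = max(a_elems, b_elems, c_elems)
        return max(2, min(64, (1 << 25) // largest))

    # Large bmm problem -> only 2 copies; tiny problem -> capped at 64 copies.
    print(mem_pool_copies(256 * 512 * 64, 256 * 512 * 64, 256 * 512 * 512))  # 2
    print(mem_pool_copies(8 * 64, 8 * 64, 8 * 8))                            # 64
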
+ double std = 1.f; + + cutlass::reference::device::BlockFillRandomGaussian(ptr, size, seed, mean, + std); + + return ptr; + } + + + cutlass::half_t* AllocateHalfGaussianTensor(int64_t size) { + return reinterpret_cast( + AllocateGaussianTensor<__half>(size)); + } + + int AllocateHalfTensor(int64_t size, int64_t copy) { + offsets.push_back(0); + strides.push_back(size); + copies.push_back(copy); + auto ptr = AllocateHalfGaussianTensor(size * copy); + ptrs.push_back(reinterpret_cast(ptr)); + return ptrs.size() - 1; + } + + float* AllocateFloatGaussianTensor(int64_t size) { + return reinterpret_cast( + AllocateGaussianTensor(size)); + } + + int AllocateFloatTensor(int64_t size, int64_t copy) { + offsets.push_back(0); + strides.push_back(size); + copies.push_back(copy); + auto ptr = AllocateFloatGaussianTensor(size * copy); + ptrs.push_back(reinterpret_cast(ptr)); + return ptrs.size() - 1; + } + + template + T* RequestTensorByIdx(int idx) { + auto copy = copies.at(idx); + auto offset = offsets.at(idx); + auto stride = strides.at(idx); + T* ptr = reinterpret_cast(ptrs.at(idx)); + ptr += offset; + offset += stride; + if (offset == copy * stride) { + offset = 0; + } + offsets[idx] = offset; + return ptr; + } + + std::vector offsets; + std::vector strides; + std::vector copies; + std::vector ptrs; + std::vector > blobs; + std::mt19937 gen; + std::uniform_int_distribution uniform_dist; +}; + +int main(int argc, char** argv) { + int device_idx; + cudaDeviceProp device_properties; + cudaError_t result = cudaGetDevice(&device_idx); + auto memory_pool = std::make_unique(); + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&device_properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + + + {{args_parse}} + + using ElementOutput = typename {{name}}::ElementC; + using ElementInputA = typename {{name}}::ElementA; + using ElementInputB = typename {{name}}::ElementB; + using ElementInputN = typename {{name}}::ElementN; + uint8_t* global_workspace = nullptr; + cudaStream_t stream = nullptr; + + {{tensor_decl}} + + // warmup + {{func_call}} + cudaEvent_t events[2]; + for (auto & event : events) { + cudaEventCreate(&event); + } + cudaEventRecord(events[0]); + for (int i = 0; i < 5; ++i) { + {{func_call}} + } + cudaEventRecord(events[1]); + cudaEventSynchronize(events[1]); + float runtime_ms = 0; + cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + for (auto event : events) { + (void)cudaEventDestroy(event); + } + // TODO: output workspace + if (runtime_ms < 0.00001) { + throw std::runtime_error( + "OOB in cutlass." 
+ ); + } + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; +} +""" +) + + +def gen_custom_libs(): + custom_libs = Target.current().get_custom_libs( + os.path.dirname(__file__), "include/gemm_with_softmax.h" + ) + return custom_libs + + +def _gemm_softmax_instance(op_def): + tmp = op_def.replace("GemmSoftmax", "GemmSoftmaxUniversal") + tmp = re.sub( + r"GemmIdentityThreadblockSwizzle<\d>", + "GemmBatchedIdentityThreadblockSwizzle", + tmp, + ) + return tmp + + +def emit_instance(op, f_instance_convertor=_gemm_softmax_instance, emit_kernel=False): + import cutlass_lib + + emiter = cutlass_lib.gemm_operation.EmitGemmInstance() + if emit_kernel: + emiter = cutlass_lib.gemm_operation.EmitGemmSoftmaxInstance() + + op_def = emiter.emit(op) + op_def = f_instance_convertor(op_def) + return op_def + + +def gen_function( + func_attrs, + src_template, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + dim_info_dict, + f_instance_convertor=_gemm_softmax_instance, + emit_kernel=False, + support_split_k=False, + output_addr_calculator="", + extra_code="", +): + func_name = func_attrs["name"] + exec_path = func_attrs["exec_path"] + op_instance = func_attrs["op_instance"] + inst_def_flag = set() + instances = {} + instance_decl = "" + for exec_item in exec_path.values(): + fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest() + algo = exec_item.algo + if algo not in inst_def_flag: + config = emit_instance(op_instance[algo], f_instance_convertor, emit_kernel) + inst_def_flag.add(algo) + else: + config = "" + inst = common.INSTANCE_TEMPLATE.render( + config=config, name=fname, config_name=common.extract_config_name(config) + ) + instances[exec_item.exec_cond] = inst + instance_decl += inst + shape_eval_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + exec_paths = "" + for key, _ in instances.items(): + fname = "f" + sha1(key.encode()).hexdigest() + program = EXEC_TEMPLATE.render( + indent=" ", + instance=fname, + problem_args=problem_args, + support_split_k=support_split_k, + ) + exec_inst = exec_cond_template.render(indent=" ", cond=key, program=program) + exec_paths += exec_inst + return src_template.render( + custom_libs=gen_custom_libs(), + instances=instance_decl, + function_name=func_name, + dtype="cutlass::half_t", + shape_eval=shape_eval_func, + output_addr_calculator=output_addr_calculator, + exec_paths=exec_paths, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=support_split_k, + has_d=common.has_d(func_attrs), + has_d1=common.has_d1(func_attrs), + extra_code=extra_code, + ) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + args_parser_template, + emit_kernel=False, + support_split_k=False, + output_addr_calculator="", + bias_ptr_arg=None, + extra_code="", +): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + + ndims = 2 + adims = ["&a_dim" + str(i) for i in range(ndims)] + bdims = ["&b_dim" + str(i) for i in range(ndims)] + cdims = ["&c_dim" + str(i) for i in range(ndims)] + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + for op_name, op in op_instance.items(): + config = emit_instance(op, emit_kernel=emit_kernel) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = 
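
Two small pieces of string processing here are easy to check in isolation: gen_function derives a deterministic C++ identifier for every exec path by hashing its condition string, and _gemm_softmax_instance rewrites the CUTLASS-emitted operator definition with a rename plus a regex swap of the threadblock swizzle. A standalone sketch with a made-up condition and op definition:

    import re
    from hashlib import sha1

    # Deterministic per-condition function name, as in gen_function above.
    exec_cond = "M >= 1 && N >= 1 && K >= 1"
    fname = "f" + sha1(exec_cond.encode()).hexdigest()
    print(fname)  # "f" + 40 hex chars, stable for the same condition string

    # Instance conversion, as in _gemm_softmax_instance: rename the op and use
    # the batched identity swizzle instead of the plain one.
    op_def = "using Op = cutlass::GemmSoftmax<..., GemmIdentityThreadblockSwizzle<8>, ...>;"
    op_def = op_def.replace("GemmSoftmax", "GemmSoftmaxUniversal")
    op_def = re.sub(
        r"GemmIdentityThreadblockSwizzle<\d>",
        "GemmBatchedIdentityThreadblockSwizzle",
        op_def,
    )
    print(op_def)
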
common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + support_split_k=support_split_k, + problem_args=problem_args_template.render(), + ) + op_func = src_template.render( + custom_libs=gen_custom_libs(), + instances=instance, + function_name="gemm", + input_ndims=2, + weight_ndims=2, + shape_eval=shape_func, + exec_paths=exec_program, + output_addr_calculator=output_addr_calculator, + support_split_k=support_split_k, + extra_code=extra_code, + ) + func_call = FUNC_CALL_TEMPLATE.render( + func_name="gemm", + a_ptr="memory_pool->RequestTensorByIdx(0)", + b_ptr="memory_pool->RequestTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestTensorByIdx(2)", + d_ptr="memory_pool->RequestTensorByIdx(3)", + n_ptr="memory_pool->RequestTensorByIdx(4)", + soft_ptr="memory_pool->RequestTensorByIdx(5)", + split_k="split_k", + adims=adims, + bdims=bdims, + cdims=cdims, + ) + code = PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parser_template.render(), + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py new file mode 100644 index 000000000..90e9d25a6 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_bias_softmax.py @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor] +This is special in template based gemm solution +This is used for `torch.nn.functional.linear` +When use for `linear`, need set A->Data, B->Weight +""" +import jinja2 + +from ... import registry +from ..gemm_universal import common +from . 
import common_softmax, gemm_rcr_softmax + + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + /* + A: M*K (RowMajor) + B: N*K (ColumnMajor) + C/D/sofmax: M*N (RowMajor) + N: M*1 (RowMajor) + */ + + {M, N, K}, + 1, + {a_ptr, LayoutA(K)}, + {b_ptr, LayoutB(K)}, + {c_ptr, 0}, + {d_ptr, LayoutC(N)}, + { + float(1.0), + float(1.0) + }, + {n_ptr, LayoutC(1)}, + {soft_ptr, LayoutC(N)} + +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.config") +def gemm_rcr_bias_softmax_config(func_attrs, dtype="float16"): + return gemm_rcr_softmax.gemm_rcr_softmax_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return gemm_rcr_softmax.common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_softmax.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return gemm_rcr_softmax.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.func_decl") +def gen_function_decl(func_attrs): + return gemm_rcr_softmax.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.func_call") +def gen_function_call(func_attrs, indent=" "): + return gemm_rcr_softmax.gen_function_call( + func_attrs, + indent, + ) + + +@registry.reg("cuda.gemm_rcr_bias_softmax.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py new file mode 100644 index 000000000..eb3fcde49 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/gemm_rcr_softmax.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor] +This is special in template based gemm solution +This is used for `torch.nn.functional.linear` +When use for `linear`, need set A->Data, B->Weight +""" +import jinja2 + +from ... import registry +from ..gemm_universal import common +from ..gemm_universal.layout import RCR +from . 
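
Compared with the non-bias gemm_rcr_softmax problem arguments further below, the bias variant above binds the C operand with a leading stride of 0 ({c_ptr, 0}) and uses beta = 1.0 instead of 0.0, so the epilogue's alpha * AB + beta * C term reads the same row of C for every output row, which amounts to adding a broadcast bias vector before the softmax. A NumPy sketch of that reading of the arguments (treating c as the length-N bias row):

    import numpy as np

    # alpha * (A @ B^T) + beta * C, with C read at row stride 0: every output
    # row sees the same length-N vector, i.e. a broadcast bias add.
    def gemm_rcr_bias_reference(a, b, bias, alpha=1.0, beta=1.0):
        return alpha * (a @ b.T) + beta * bias[None, :]

    a = np.random.randn(8, 16).astype(np.float16)
    b = np.random.randn(4, 16).astype(np.float16)   # RCR: B stored as [N, K]
    bias = np.random.randn(4).astype(np.float16)
    print(gemm_rcr_bias_reference(a, b, bias).shape)  # (8, 4)
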
import common_softmax + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); + int64_t split_k = std::atoi(argv[4]); + + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = N; + int64_t b_dim1 = K; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" +) + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + /* + A: M*K (RowMajor) + B: N*K (ColumnMajor) + C/D/sofmax: M*N (RowMajor) + N: M*1 (RowMajor) + */ + + {M, N, K}, + 1, + {a_ptr, LayoutA(K)}, + {b_ptr, LayoutB(K)}, + {c_ptr, LayoutC(N)}, + {d_ptr, LayoutC(N)}, + { + float(1.0), + float(0.0) + }, + {n_ptr, LayoutC(1)}, + {soft_ptr, LayoutC(N)} + +""" +) + + +@registry.reg("cuda.gemm_rcr_softmax.config") +def gemm_rcr_softmax_config(func_attrs, dtype="float16"): + common.make_fproc_f16(func_attrs, RCR) + + +def common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + bias_ptr_arg=None, + extra_code="", +): + output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="*b_dim0" + ) + common_softmax.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + ARGS_PARSER_TEMPLATE, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=output_addr_calculator, + bias_ptr_arg=bias_ptr_arg, + extra_code=extra_code, + ) + + +@registry.reg("cuda.gemm_rcr_softmax.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_softmax.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_softmax.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + problem_args_template=None, +): + if problem_args_template is None: + problem_args = PROBLEM_ARGS_TEMPLATE.render() + else: + problem_args = problem_args_template.render() + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_softmax.gen_function( + func_attrs, + common_softmax.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + dim_info_dict, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", output_accessor=func_attrs["output_accessors"][0] + ), + ) + + +@registry.reg("cuda.gemm_rcr_softmax.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_softmax.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rcr_softmax.func_call") +def gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + b = func_attrs["inputs"][1] + + tmp_c = func_attrs["inputs"][2] + tmp_d = func_attrs["inputs"][3] + tmp_n = func_attrs["inputs"][4] + + soft = func_attrs["outputs"][0] + has_bias = False + adims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][0].original_shapes + ] + bdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][1].original_shapes + ] + cdims = [ + "&" + dim._attrs["name"] + for dim in 
func_attrs["output_accessors"][0].original_shapes + ] + return common_softmax.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + c_ptr=tmp_c._attrs["name"], + d_ptr=tmp_d._attrs["name"], + n_ptr=tmp_n._attrs["name"], + soft_ptr=soft._attrs["name"], + split_k=func_attrs["split_k"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + ) + + +@registry.reg("cuda.gemm_rcr_softmax.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h new file mode 100644 index 000000000..3b168b3d8 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_epilogue_vistor/include/gemm_with_softmax.h @@ -0,0 +1,302 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +namespace cutlass { + +template < + typename ElementA_, + typename LayoutA_, + int kAlignmentA, + typename ElementB_, + typename LayoutB_, + int kAlignmentB, + typename ElementC_, + int kAlignmentC, + typename OperatorClass, + typename ArchTag, + typename ElementAccumulator, + int kStages, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueFunctorOp, + typename ThreadblockSwizzle, + typename ElementSum_ = ElementAccumulator, + typename ElementSoftmax_ = ElementC_> + +class GemmSoftmaxUniversal { + public: + /////////////////////////////////////////////////////////////////////////////////////////////// + + // + // Type definitions + // + + using ElementA = ElementA_; + using ElementB = ElementB_; + using ElementC = ElementC_; + using ElementCompute = ElementAccumulator; + using ElementSum = ElementSum_; + using ElementSoft = ElementSoftmax_; + + using LayoutA = LayoutA_; + using LayoutB = LayoutB_; + + static int const kAlignment = kAlignmentA; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + /// Linear scaling operator + // using EpilogueFunctorOp = cutlass::epilogue::thread::LinearCombination< + // ElementC, + // kAlignment, + // ElementCompute, + // ElementCompute + // >; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + // This is a mandatory data type for the atomic reduction in the GEMM epilogue + // to function. + + using ElementN = float; + + // These are mandatory layouts. 
+ using LayoutC = cutlass::layout::RowMajor; + using LayoutN = cutlass::layout::RowMajor; + using LayoutSoft = cutlass::layout::RowMajor; + + using TensorRefA = TensorRef; + using TensorRefB = TensorRef; + using TensorRefC = TensorRef; + using TensorRefN = TensorRef; + using TensorRefSoft = TensorRef; + + // using OperatorClass = cutlass::arch::OpClassTensorOp; + // using ArchTag = cutlass::arch::Sm80; + // static int const kStages = Stages; + // using ThreadblockSwizzle = + // cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + // basic GEMM kernel + using DefaultGemmKernel = typename cutlass::gemm::kernel::DefaultGemm< + ElementA, + LayoutA, + kAlignment, + ElementB, + LayoutB, + kAlignment, + ElementC, + LayoutC, + ElementCompute, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueFunctorOp, + ThreadblockSwizzle, + kStages, + true, + typename cutlass::gemm::device::DefaultGemmConfiguration< + OperatorClass, + ArchTag, + ElementA, + ElementB, + ElementC, + ElementCompute>::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>::GemmKernel; + + /////////////////////////////////////////////////////////////////////////////////////////////// + + // Epilogue visitor + using EpilogueVisitor = kernel::EpilogueVisitorBiasMax< + ThreadblockShape, + DefaultGemmKernel::kThreadCount, + typename DefaultGemmKernel::Epilogue::OutputTileIterator, + ElementCompute, + EpilogueFunctorOp>; + + /// Epilogue + using Epilogue = typename cutlass::epilogue::threadblock:: + EpilogueWithVisitorFromExistingEpilogue< + EpilogueVisitor, + typename DefaultGemmKernel::Epilogue>::Epilogue; + + // GEMM + using GemmKernel = gemm::kernel::GemmWithEpilogueVisitor< + typename DefaultGemmKernel::Mma, + Epilogue, + ThreadblockSwizzle>; + + // Softmax kernel + using SoftmaxApplyKernel = kernel::ApplySoftmax< + ElementC, + ElementN, + ElementSum, + ElementSoft, + kAlignmentC, + MatrixShape<1, 1024>>; + + public: + /// Arguments class + struct Arguments { + typename GemmKernel::Arguments gemm; + + typename SoftmaxApplyKernel::Arguments softmax; + + // + // Methods + // + Arguments() {} + + Arguments( + cutlass::gemm::GemmCoord problem_size, + int32_t batch_count_, + TensorRefA ref_A_, + TensorRefB ref_B_, + TensorRefC ref_C_, + TensorRefC ref_D_, + typename EpilogueFunctorOp::Params linear_scaling, + TensorRefN ref_N_, + TensorRefSoft ref_Softmax_, + int64_t batch_stride_A_ = 0, + int64_t batch_stride_B_ = 0, + int64_t batch_stride_C_ = 0, + int64_t batch_stride_D_ = 0, + int64_t batch_stride_Max_ = 0, + int64_t batch_stride_Softmax_ = 0) + : gemm( + cutlass::gemm::GemmUniversalMode::kBatched, + problem_size, + batch_count_, + ref_A_, + ref_B_, + batch_stride_A_, + batch_stride_B_, + typename EpilogueVisitor::Arguments( + linear_scaling, + ref_C_, + ref_D_, + ref_N_.data(), + batch_stride_C_, + batch_stride_D_, + batch_stride_Max_)), + softmax( + MatrixCoord(problem_size.m(), problem_size.n()), + batch_count_, + ref_D_, + ref_N_, + ref_Softmax_, + batch_stride_D_, + batch_stride_Max_, + batch_stride_Softmax_) {} + }; + + struct Params { + typename GemmKernel::Params gemm; + + typename SoftmaxApplyKernel::Params softmax; + + // + // Methods + // + Params() {} + + Params(Arguments const& args) : gemm(args.gemm), softmax(args.softmax) {} + }; + + public: + // Gemm + + // + // Methods + // + + private: + Params params_; + + public: + /// Ctor + GemmSoftmaxUniversal() {} + + 
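
GemmSoftmaxUniversal composes two launches: the GEMM kernel, whose epilogue visitor writes D along with a per-row statistic into the auxiliary N tensor, and the ApplySoftmax kernel, which normalizes D using that statistic. A NumPy sketch of the end-to-end math the pair implements, assuming N holds the per-row maximum (consistent with the EpilogueVisitorBiasMax name and the float ElementN used for the reduction):

    import numpy as np

    # Reference for the fused bmm_rcr + softmax path: D = A @ B^T per batch,
    # N = row-wise max of D in float, soft = softmax(D) along the last dim.
    def bmm_rcr_softmax_reference(a, b):
        d = np.matmul(a, b.swapaxes(-1, -2)).astype(np.float32)
        n = d.max(axis=-1, keepdims=True)
        e = np.exp(d - n)
        soft = e / e.sum(axis=-1, keepdims=True)
        return d.astype(a.dtype), n, soft.astype(a.dtype)

    a = np.random.randn(2, 8, 16).astype(np.float16)
    b = np.random.randn(2, 4, 16).astype(np.float16)
    d, n, soft = bmm_rcr_softmax_reference(a, b)
    print(soft.sum(-1))  # each row sums to ~1
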
/// Initialize + Status initialize(Arguments const& args) { + params_ = Params(args); + + return cutlass::Status::kSuccess; + } + + /// Run + Status run(cudaStream_t stream) { + // + // Launch the GEMM + max kernel + // + + dim3 gemm_grid = + ThreadblockSwizzle().get_grid_shape(params_.gemm.grid_tiled_shape); + + dim3 gemm_block(GemmKernel::kThreadCount, 1, 1); + + int gemm_smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + cutlass::Kernel + <<>>(params_.gemm); + + cudaError_t result = cudaGetLastError(); + + if (result != cudaSuccess) { + return cutlass::Status::kErrorInternal; + } + + // + // Launch the SoftmaxApplyKernel + // + + dim3 apply_block( + SoftmaxApplyKernel::Shape::kColumn, SoftmaxApplyKernel::Shape::kRow); + + int cta_rows = SoftmaxApplyKernel::Shape::kRow; + int cta_columns = + SoftmaxApplyKernel::Shape::kColumn * SoftmaxApplyKernel::kAlignment; + + dim3 apply_grid( + (params_.softmax.args.extent.row() + cta_rows - 1) / cta_rows, + (params_.softmax.args.extent.column() + cta_columns - 1) / cta_columns, + params_.softmax.args.batch_count); + + Kernel + <<>>(params_.softmax); + + result = cudaGetLastError(); + + if (result != cudaSuccess) { + return cutlass::Status::kErrorInternal; + } + + return cutlass::Status::kSuccess; + } + + /// Function call operator + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } +}; + +} // namespace cutlass diff --git a/python/aitemplate/backend/cuda/gemm_special/__init__.py b/python/aitemplate/backend/cuda/gemm_special/__init__.py new file mode 100644 index 000000000..93043be2c --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_special/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +special gemm ops +""" +from . import bmm_rcr_n1, bmm_rrr_k1_tanh, gemm_rrr_small_nk + + +__all__ = ["bmm_rcr_n1", "bmm_rrr_k1_tanh", "gemm_rrr_small_nk"] diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py new file mode 100644 index 000000000..5582ee24e --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rcr_n1.py @@ -0,0 +1,616 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +GEMM Specialization for A[RowMajor], B[ColMajor], C[RowMajor] +This is special in template based gemm solution +This is used for `torch.nn.functional.linear` +When use for `linear`, need set A->Data, B->Weight + +Special kernel for GEMV case: +A: [B, M, K] +B: [B, N, K] +C: [B, M, N] +where N = 1 + +This kernel computes C = alpha * A @ B +""" + +import jinja2 + +from ....compiler.base import IntImm + +from ... import registry +from ...backend_spec import CUDASpec +from ...common import gemm_common, tensor_accessor_codegen +from ...target import Target +from ..gemm_universal import common + +# pylint: disable=C0301,W0613,W0612 + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + {{elem_input_type}}*, + {{elem_input_type}}*, + {{elem_input_type}}*, + {% for i in range(3) %} + int64_t*, + {% endfor %} + {% for i in range(3) %} + int64_t*, + {% endfor %} + {% for i in range(3) %} + int64_t*, + {% endfor %} + float, + bool, + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{indent}}{{local_dim_defs}} +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{{indent}} {{c_ptr}}, +{% for adim in adims %} +{{indent}} {{adim}}, +{% endfor %} +{% for bdim in bdims %} +{{indent}} {{bdim}}, +{% endfor %} +{% for cdim in cdims %} +{{indent}} {{cdim}}, +{% endfor %} +{{indent}} {{alpha}}, +{{indent}} {{use_fp16_acc}}, +{{indent}} stream +{{indent}}); +{{indent}}} +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}bmm_rcr_n1_launcher<{{elem_input_type}}, {{read_vec_type}}, {{K}}>( +{{indent}} a_ptr, +{{indent}} b_ptr, +{{indent}} c_ptr, +{{indent}} B, +{{indent}} M, +{{indent}} alpha, +{{indent}} use_fp16_acc, +{{indent}} stream, +{{intent}} input_a_accessor, +{{intent}} input_b_accessor, +{{intent}} output_accessor +{{indent}}); +{{indent}}return; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include "cutlass/util/host_tensor.h" + +namespace { + +{{tensor_accessor_libs}} + +template +__forceinline__ __device__ bool load_vec_data( + ReadVecT* a_ptr, + ReadVecT* b_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor, + ReadVecT *a_vec, + ReadVecT *b_vec) { + + int64_t batch_idx = blockIdx.y; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + + constexpr int64_t N_READ_ELEMS_IN_V = sizeof(ReadVecT) / sizeof(ElemT); + constexpr int64_t N_NUM_ELEMS_IN_V = K / N_READ_ELEMS_IN_V; + + int64_t b_idx_base = (batch_idx * K) / N_READ_ELEMS_IN_V; + + if (blockDim.x >= N_NUM_ELEMS_IN_V) { + // We have enough threads in a thread block where each thread takes care + // of loading one vector. + if (threadIdx.x < N_NUM_ELEMS_IN_V) { + b_vec[threadIdx.x] = *input_b_accessor.get(b_ptr, b_idx_base + threadIdx.x); + } + } else { + // We have more vectors than the available threads of a thread block, so each + // thread may read multiple vectors. 
+ for (int64_t i = 0; i < N_NUM_ELEMS_IN_V / blockDim.x + 1; i++) { + int64_t idx = i * blockDim.x + threadIdx.x; + if (idx < N_NUM_ELEMS_IN_V) { + b_vec[idx] = *input_b_accessor.get(b_ptr, b_idx_base + idx); + } + } + } + + __syncthreads(); + if (row_idx >= M) { + return false; + } + + int64_t a_batch_stride = M * K; + int64_t a_idx_base = (batch_idx * a_batch_stride + row_idx * K) / N_READ_ELEMS_IN_V; + + CUTLASS_PRAGMA_UNROLL + for (int64_t k = 0, i = 0; k < K; k += N_READ_ELEMS_IN_V, i++) { + a_vec[i] = *input_a_accessor.get(a_ptr, a_idx_base++); + } + + return true; +} + +// Each thread reads one row from "a" and one column from "b", +// computes dot_product(a_row, b_col), and writes the result to "c". +// This kernel assumes loading "a" and "b" can be fully vectorized, +// so it reads both "a" and "b" in ReadVecT. +template +__global__ void bmm_rcr_n1_kernel_fp32_acc_vec( + ReadVecT* a_ptr, + ReadVecT* b_ptr, + ElemT* c_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor) { + + static_assert(sizeof(ReadVecT) % sizeof(ElemT) == 0, "invalid vector type"); + constexpr int64_t N_READ_ELEMS_IN_V = sizeof(ReadVecT) / sizeof(ElemT); + static_assert(N_READ_ELEMS_IN_V % 2 == 0, "invalid vector type for read"); + static_assert(K % N_READ_ELEMS_IN_V == 0, "cannot vectorize input"); + constexpr int64_t N_NUM_ELEMS_IN_V = K / N_READ_ELEMS_IN_V; + + __shared__ ReadVecT b_vec[N_NUM_ELEMS_IN_V]; + ReadVecT a_vec[N_NUM_ELEMS_IN_V]; + + if (!load_vec_data( + a_ptr, b_ptr, M, alpha, input_a_accessor, input_b_accessor, + output_accessor, a_vec, b_vec)) { + return; + } + + float result = 0.0; + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < N_NUM_ELEMS_IN_V; i++) { + const half2* a_vec_h2 = reinterpret_cast(&a_vec[i]); + const half2* b_vec_h2 = reinterpret_cast(&b_vec[i]); + CUTLASS_PRAGMA_UNROLL + for (int64_t j = 0; j < N_READ_ELEMS_IN_V / 2; ++j) { + half2 c_h2 = __hmul2(a_vec_h2[j], b_vec_h2[j]); + result += float(__low2half(c_h2)) + float(__high2half(c_h2)); + } + } + + int64_t batch_idx = blockIdx.y; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + *output_accessor.get(c_ptr, batch_idx * M + row_idx) = alpha * result; +} + +template +__forceinline__ __device__ bool load_data( + ElemT* a_ptr, + ElemT* b_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor, + ElemT *a_data, + ElemT *b_data) { + + int64_t batch_idx = blockIdx.y; + int64_t b_idx_base = batch_idx * K; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (blockDim.x >= K) { + // We have enough threads in a thread block where each thread takes care + // of loading one element. + if (threadIdx.x < K) { + b_data[threadIdx.x] = *input_b_accessor.get(b_ptr, b_idx_base + threadIdx.x); + } + } else { + // We have more elements than the available threads of a thread block, so each + // thread may load multiple elements. 
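
bmm_rcr_n1 is effectively a batched GEMV with one output column: for each batch, every output element is alpha times the dot product of one row of A with the single K-length row of B, which is why B can be staged once in shared memory and reused by every thread in the block while each thread streams its own row of A. A NumPy reference of what the kernels above compute:

    import numpy as np

    # C[b, m, 0] = alpha * dot(A[b, m, :], B[b, 0, :])  -- the N == 1 case of bmm_rcr.
    def bmm_rcr_n1_reference(a, b, alpha=1.0):
        # a: [B, M, K], b: [B, 1, K] -> c: [B, M, 1]
        return alpha * np.einsum("bmk,bnk->bmn", a, b)

    a = np.random.randn(3, 128, 32).astype(np.float16)
    b = np.random.randn(3, 1, 32).astype(np.float16)
    print(bmm_rcr_n1_reference(a, b, alpha=0.5).shape)  # (3, 128, 1)
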
+ for (int64_t i = 0; i < K / blockDim.x + 1; i++) { + int64_t idx = i * blockDim.x + threadIdx.x; + if (idx < K) { + b_data[idx] = *input_b_accessor.get(b_ptr, b_idx_base + idx); + } + } + } + + __syncthreads(); + + if (row_idx >= M) { + return false; + } + + int64_t a_batch_stride = M * K; + int64_t a_idx_base = batch_idx * a_batch_stride + row_idx * K; + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < K; i++) { + a_data[i] = *input_a_accessor.get(a_ptr, a_idx_base++); + } + + return true; +} + +// Each thread reads one row from "a" and one column from "b", +// computes dot_product(a_row, b_col), and writes the result to "c". +// It reads both "a" and "b" one by one in ElemT. +template +__global__ void bmm_rcr_n1_kernel_fp32_acc( + ElemT* a_ptr, + ElemT* b_ptr, + ElemT* c_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor) { + + __shared__ ElemT b_data[K]; + ElemT a_data[K]; + + if (!load_data( + a_ptr, b_ptr, M, alpha, input_a_accessor, input_b_accessor, + output_accessor, a_data, b_data)) { + return; + } + + float result = 0.0; + + const half2* a_data_h2 = reinterpret_cast(&a_data[0]); + const half2* b_data_h2 = reinterpret_cast(&b_data[0]); + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < K / 2; ++i) { + half2 c_h2 = __hmul2(a_data_h2[i], b_data_h2[i]); + result += float(__low2half(c_h2)) + float(__high2half(c_h2)); + } + if (K % 2) { + result += float(__hmul(reinterpret_cast(a_data[K-1]), + reinterpret_cast(b_data[K-1]))); + } + + int64_t batch_idx = blockIdx.y; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + + *output_accessor.get(c_ptr, batch_idx * M + row_idx) = alpha * result; +} + +template +__global__ void bmm_rcr_n1_kernel_fp16_acc_vec( + ReadVecT* a_ptr, + ReadVecT* b_ptr, + ElemT* c_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor) { + + static_assert(sizeof(ReadVecT) % sizeof(ElemT) == 0, "invalid vector type"); + constexpr int64_t N_READ_ELEMS_IN_V = sizeof(ReadVecT) / sizeof(ElemT); + static_assert(N_READ_ELEMS_IN_V % 2 == 0, "invalid vector type for read"); + static_assert(K % N_READ_ELEMS_IN_V == 0, "cannot vectorize input"); + constexpr int64_t N_NUM_ELEMS_IN_V = K / N_READ_ELEMS_IN_V; + + __shared__ ReadVecT b_vec[N_NUM_ELEMS_IN_V]; + ReadVecT a_vec[N_NUM_ELEMS_IN_V]; + + if (!load_vec_data( + a_ptr, b_ptr, M, alpha, input_a_accessor, input_b_accessor, + output_accessor, a_vec, b_vec)) { + return; + } + + half2 result_h2 = {0.0, 0.0}; + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < N_NUM_ELEMS_IN_V; i++) { + const half2* a_vec_h2 = reinterpret_cast(&a_vec[i]); + const half2* b_vec_h2 = reinterpret_cast(&b_vec[i]); + CUTLASS_PRAGMA_UNROLL + for (int64_t j = 0; j < N_READ_ELEMS_IN_V / 2; ++j) { + result_h2 = __hfma2(a_vec_h2[j], b_vec_h2[j], result_h2); + } + } + + float result = __hadd(__low2half(result_h2), __high2half(result_h2)); + + int64_t batch_idx = blockIdx.y; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + *output_accessor.get(c_ptr, batch_idx * M + row_idx) = alpha * result; +} + +template +__global__ void bmm_rcr_n1_kernel_fp16_acc( + ElemT* a_ptr, + ElemT* b_ptr, + ElemT* c_ptr, + const int64_t M, + float alpha, + TensorAccessor input_a_accessor, + TensorAccessor input_b_accessor, + TensorAccessor output_accessor) { + + __shared__ ElemT b_data[K]; + ElemT a_data[K]; + + if (!load_data( + a_ptr, b_ptr, M, alpha, input_a_accessor, 
input_b_accessor, + output_accessor, a_data, b_data)) { + return; + } + + half2 result_h2 = {0.0, 0.0}; + + const half2* a_data_h2 = reinterpret_cast(&a_data[0]); + const half2* b_data_h2 = reinterpret_cast(&b_data[0]); + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < K / 2; ++i) { + result_h2 = __hfma2(a_data_h2[i], b_data_h2[i], result_h2); + } + + half result = __hadd(__low2half(result_h2), __high2half(result_h2)); + if (K % 2) { + result = __hfma(reinterpret_cast(a_data[K-1]), + reinterpret_cast(b_data[K-1]), + result); + } + + int64_t batch_idx = blockIdx.y; + int64_t row_idx = blockIdx.x * blockDim.x + threadIdx.x; + *output_accessor.get(c_ptr, batch_idx * M + row_idx) = + alpha * (float)result; +} + +// N = 1, K is small +template +void bmm_rcr_n1_launcher(ElemT* a_ptr, + ElemT* b_ptr, + ElemT* c_ptr, + int64_t B, + int64_t M, + float alpha, + bool use_fp16_acc, + cudaStream_t stream, + const TensorAccessor& input_a_accessor, + const TensorAccessor& input_b_accessor, + const TensorAccessor& output_accessor) { + const int nthread = 256; + dim3 thread_block(nthread); + dim3 grid((M + nthread - 1) / nthread, B); + + if(use_fp16_acc) { + {{bmm_rcr_n1_kernel_fp16}} + <<>>( + (ReadVecT*)a_ptr, + (ReadVecT*)b_ptr, + c_ptr, + M, + alpha, + input_a_accessor, + input_b_accessor, + output_accessor + ); + } else { + {{bmm_rcr_n1_kernel_fp32}} + <<>>( + (ReadVecT*)a_ptr, + (ReadVecT*)b_ptr, + c_ptr, + M, + alpha, + input_a_accessor, + input_b_accessor, + output_accessor + ); + } +} + +} // namespace + +void {{function_name}} ( + {{elem_input_type}}* a_ptr, + {{elem_input_type}}* b_ptr, + {{elem_input_type}}* c_ptr, + {% for i in range(3) %} + int64_t *a_dim{{loop.index0}}, + {% endfor %} + {% for i in range(3) %} + int64_t *b_dim{{loop.index0}}, + {% endfor %} + {% for i in range(3) %} + int64_t *c_dim{{loop.index0}}, + {% endfor %} + float alpha, + bool use_fp16_acc, + cudaStream_t stream +) { + {{shape_function}} + {{input_output_checks}} + {{input_accessors}} + {{output_accessors}} + {{exec_paths}} +} + +""" +) + + +@registry.reg("cuda.bmm_rcr_n1.gen_function") +def gen_function(func_attrs, exec_cond_template, dim_info_dict): + func_name = func_attrs["name"] + shape_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + def _get_original_dim_val(func_attrs, input_idx, dim): + accessor = func_attrs["input_accessors"][input_idx] + shape = accessor.original_shapes + assert isinstance( + shape[dim], IntImm + ), f"input {input_idx}'s dim {dim} must be static. Instead it's dynamic" + k = shape[dim]._attrs["values"][0] + return k + + # Get original k value in case it's changed to a strided tensor after + # fusing split op into bmm_rcr. Strided dim can only be the last dim. + ak = _get_original_dim_val(func_attrs, 0, 2) + bk = _get_original_dim_val(func_attrs, 1, 2) + assert ak == bk, f"ak is not equal to bk. 
ak: {ak}, bk: {bk}" + + elem_input_type = "cutlass::half_t" + backend_spec = CUDASpec() + vec_lens = list(zip(*backend_spec.read_num_elements_to_backend_type))[0][:-1] + alignment = tensor_accessor_codegen.find_max_alignment( + ak, func_attrs["input_accessors"] + ) + if alignment % 2: + bmm_rcr_n1_kernel_fp32 = "bmm_rcr_n1_kernel_fp32_acc" + bmm_rcr_n1_kernel_fp16 = "bmm_rcr_n1_kernel_fp16_acc" + read_vec_type = elem_input_type + else: + for vec_idx, vec_len in enumerate(vec_lens): + if ak % vec_len == 0: + bmm_rcr_n1_kernel_fp32 = "bmm_rcr_n1_kernel_fp32_acc_vec" + bmm_rcr_n1_kernel_fp16 = "bmm_rcr_n1_kernel_fp16_acc_vec" + read_vec_type = backend_spec.read_num_elements_to_backend_type[vec_idx][ + 1 + ] + break + + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=3, + weight_ndims=3, + output_ndims=3, + ) + if ak == 0: + # avoid compilation failure (zero-sized variable not alowed in device code) + # caused by instantiating the template with K=0 + exec_paths = "" + else: + exec_paths = EXEC_TEMPLATE.render( + indent=" ", + read_vec_type=read_vec_type, + elem_input_type=elem_input_type, + K=ak, + ) + + input_a_accessor = tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render( + name="input_a_accessor", tensor_accessor=func_attrs["input_accessors"][0] + ) + + input_b_accessor = tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render( + name="input_b_accessor", tensor_accessor=func_attrs["input_accessors"][1] + ) + + return SRC_TEMPLATE.render( + function_name=func_name, + elem_input_type=elem_input_type, + bmm_rcr_n1_kernel_fp32=bmm_rcr_n1_kernel_fp32, + bmm_rcr_n1_kernel_fp16=bmm_rcr_n1_kernel_fp16, + shape_function=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_paths, + tensor_accessor_libs=tensor_accessor_codegen.get_libs(), + input_accessors=input_a_accessor + input_b_accessor, + output_accessors=tensor_accessor_codegen.TENSOR_ACCESSOR_TEMPLATE.render( + name="output_accessor", tensor_accessor=func_attrs["output_accessors"][0] + ), + ) + + +@registry.reg("cuda.bmm_rcr_n1.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, elem_input_type="cutlass::half_t" + ) + + +@registry.reg("cuda.bmm_rcr_n1.func_call") +def gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + ashape = func_attrs["input_accessors"][0].original_shapes + adims = ["&" + dim._attrs["name"] for dim in ashape] + b = func_attrs["inputs"][1] + bshape = func_attrs["input_accessors"][1].original_shapes + bdims = ["&" + dim._attrs["name"] for dim in bshape] + c = func_attrs["outputs"][0] + cshape = func_attrs["output_accessors"][0].original_shapes + cdims = ["&" + dim._attrs["name"] for dim in cshape] + alpha = func_attrs["alpha"] + use_fp16_acc = False + if "use_fp16_acc" in Target.current()._kwargs: + use_fp16_acc = Target.current()._kwargs["use_fp16_acc"] + return FUNC_CALL_TEMPLATE.render( + local_dim_defs=common.gen_local_dim_defs(func_attrs, indent=indent), + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + c_ptr=c._attrs["name"], + adims=adims, + bdims=bdims, + cdims=cdims, + alpha=alpha, + use_fp16_acc="true" if use_fp16_acc else "false", + indent=indent, + ) + + +@registry.reg("cuda.bmm_rcr_n1.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. 
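
The alignment handling directly above selects the kernel flavor: when the maximum alignment over the input accessors is odd, the scalar bmm_rcr_n1_kernel_*_acc kernels are used; otherwise the widest read vector whose element count divides K is chosen from the backend spec. A simplified sketch of that selection, with candidate widths of 8/4/2 halves assumed here in place of CUDASpec.read_num_elements_to_backend_type:

    # Simplified vector-width selection: widest of 8/4/2 halves that divides K,
    # scalar fallback when the maximum alignment is odd. The assumed widths stand
    # in for the backend spec's table of read types.
    VEC_WIDTHS = (8, 4, 2)

    def pick_read_vec(k: int, max_alignment: int) -> int:
        if max_alignment % 2:
            return 1  # scalar cutlass::half_t loads
        for width in VEC_WIDTHS:
            if k % width == 0:
                return width
        return 1

    print(pick_read_vec(k=64, max_alignment=8))  # 8 halves per 16-byte load
    print(pick_read_vec(k=6, max_alignment=2))   # 2 halves per load
    print(pick_read_vec(k=7, max_alignment=1))   # scalar fallback
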
+ ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py new file mode 100644 index 000000000..de29a6ab7 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_special/bmm_rrr_k1_tanh.py @@ -0,0 +1,258 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen for bmm_rrr_k1_tanh. + +This kernel computes C = tanh(alpha * A @ B), where: +A[RowMajor]: [B, M, 1] +B[RowMajor]: [B, 1, N] +C[RowMajor]: [B, M, N] +""" +import jinja2 + +from ... import registry +from ...common import gemm_common +from ..gemm_universal import common + +# pylint: disable=C0301,W0613,W0612 + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + {% for i in range(3) %} + int64_t*, + {% endfor %} + {% for i in range(3) %} + int64_t*, + {% endfor %} + {% for i in range(3) %} + int64_t*, + {% endfor %} +cudaStream_t +); +""" +) + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{{indent}} {{c_ptr}}, +{% for adim in adims %} +{{indent}} {{adim}}, +{% endfor %} +{% for bdim in bdims %} +{{indent}} {{bdim}}, +{% endfor %} +{% for cdim in cdims %} +{{indent}} {{cdim}}, +{% endfor %} +{{indent}} stream +{{indent}}); +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}bmm_rrr_k1_tanh_launcher( +{{indent}} a_ptr, +{{indent}} b_ptr, +{{indent}} c_ptr, +{{indent}} B, +{{indent}} M, +{{indent}} N, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include "cutlass/util/host_tensor.h" +#include "cutlass/fast_math.h" + +#ifndef __HALF_TO_US +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#endif + +namespace { + +__device__ half fast_tanh(half x) { + #if defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 11) && (__CUDA_ARCH__ >= 750) + + asm volatile ( "tanh.approx.f16 %0, %1;" : "=h"(__HALF_TO_US(x)) : "h"(__HALF_TO_US(x))); + return x; + + #else + return half(cutlass::fast_tanh(float(x))); + #endif +} + +template +__global__ void bmm_rrr_k1_tanh_kernel(const float4* a_ptr, + const float4* b_ptr, + float4* c_ptr, + const int B, + const int M, + const int N) { + // TODO: check boundary + half tmp[64]; + int idx = blockIdx.x * num_thread + threadIdx.x; + int m = idx % M; + int b = idx / M; + int a_idx_base = b * M + m; + float4 a_vec = __ldg(a_ptr + a_idx_base); + half* a_vec_ptr = (half*)(&a_vec); + for (int n = 0; n < N; ++n) { + int b_idx_base = b * N + n; + float4 b_vec = __ldg(b_ptr + b_idx_base); + half* b_vec_ptr = (half*)(&b_vec); + for (int i = 0; i < 8; ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < 8; ++j) { + tmp[i * 8 + j] = fast_tanh(__hmul(a_vec_ptr[i], b_vec_ptr[j])); + } + } 
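
bmm_rrr_k1_tanh covers the K = 1 case, where the batched matmul collapses into an outer product per batch: each thread loads eight halves of the A column and eight halves of the B row per float4 and writes an 8 x 8 tile of tanh(a_i * b_j), so the launch math assumes M and N are multiples of 8 (the TODO in the kernel flags the missing boundary checks). A NumPy reference of the computation the kernel performs:

    import numpy as np

    # K == 1: A is [B, M, 1], B is [B, 1, N], so A @ B is a per-batch outer
    # product and C[b, m, n] = tanh(A[b, m, 0] * B[b, 0, n]).
    def bmm_rrr_k1_tanh_reference(a, b):
        return np.tanh(np.matmul(a.astype(np.float32), b.astype(np.float32)))

    a = np.random.randn(2, 64, 1).astype(np.float16)
    b = np.random.randn(2, 1, 32).astype(np.float16)
    print(bmm_rrr_k1_tanh_reference(a, b).shape)  # (2, 64, 32)
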
+ CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + int c_idx = (b * M * 8 + m * 8 + i) * N + n; + c_ptr[c_idx] = *((const float4*)(tmp + i * 8)); + } + } +} + + +void bmm_rrr_k1_tanh_launcher(cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* c_ptr, + int B, + int M, + int N, + cudaStream_t stream) { + const int nthread = 256; + dim3 thread_block(nthread); + dim3 grid(B * M / nthread / 8); + bmm_rrr_k1_tanh_kernel<<>>( + (const float4*)a_ptr, + (const float4*)b_ptr, + (float4*) c_ptr, + B, + M / 8, + N / 8 + ); +} + +} // namespace + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* c_ptr, + {% for i in range(3) %} + int64_t *a_dim{{loop.index0}}, + {% endfor %} + {% for i in range(3) %} + int64_t *b_dim{{loop.index0}}, + {% endfor %} + {% for i in range(3) %} + int64_t *c_dim{{loop.index0}}, + {% endfor %} + cudaStream_t stream +) { + {{shape_function}} + {{input_output_checks}} + {{exec_paths}} +} + +""" +) + + +@registry.reg("cuda.bmm_rrr_k1_tanh.gen_function") +def gen_function(func_attrs, exec_cond_template, dim_info_dict): + func_name = func_attrs["name"] + shape_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=3, + weight_ndims=3, + output_ndims=3, + ) + exec_paths = EXEC_TEMPLATE.render() + return SRC_TEMPLATE.render( + function_name=func_name, + shape_function=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_paths, + ) + + +@registry.reg("cuda.bmm_rrr_k1_tanh.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return FUNC_DECL_TEMPLATE.render(func_name=func_name) + + +@registry.reg("cuda.bmm_rrr_k1_tanh.func_call") +def gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + ashape = a._attrs["shape"] + adims = ["&" + dim._attrs["name"] for dim in ashape] + b = func_attrs["inputs"][1] + bshape = b._attrs["shape"] + bdims = ["&" + dim._attrs["name"] for dim in bshape] + c = func_attrs["outputs"][0] + cshape = c._attrs["shape"] + cdims = ["&" + dim._attrs["name"] for dim in cshape] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + c_ptr=c._attrs["name"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + ) + + +@registry.reg("cuda.bmm_rrr_k1_tanh.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py new file mode 100644 index 000000000..81ed764e8 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_special/gemm_rrr_small_nk.py @@ -0,0 +1,374 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
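For readers cross-checking the kernel above, a minimal PyTorch reference of what `bmm_rrr_k1_tanh` is meant to produce (assuming `alpha = 1`; the module docstring mentions an alpha scale, but the kernel body multiplies the two operands directly). Shapes follow the docstring: A is [B, M, 1] and B is [B, 1, N], so the op is a batched outer product followed by tanh, and the launcher's float4 loads assume M and N are multiples of 8.

```python
import torch

def bmm_rrr_k1_tanh_ref(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # a: [B, M, 1], b: [B, 1, N], fp16, row-major; C = tanh(A @ B)
    return torch.tanh(torch.bmm(a, b))

a = torch.randn(4, 64, 1, dtype=torch.half, device="cuda")
b = torch.randn(4, 1, 32, dtype=torch.half, device="cuda")
c = bmm_rrr_k1_tanh_ref(a, b)  # [4, 64, 32]
```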
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for A[RowMajor], B[RowMajor], C[RowMajor] +This is special in template based gemm solution +This is used for `torch.nn.functional.linear` +When use for `linear`, need set A->Data, B->Weight + +Special kernel for small K and N +K <= 8, N <= 8 +A: [M, K] A can be ND with the first N - 1 dimensions as batch dimensions +B: [K, N] +C: [M, N] +""" + +import jinja2 + +from ... import registry +from ...common import gemm_common +from ...target import Target +from ..gemm_universal import common + +# pylint: disable=C0301,W0613,W0612 + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + {% for i in range(a_ndim) %} + int64_t*, + {% endfor %} + {% for i in range(b_ndim) %} + int64_t*, + {% endfor %} + {% for i in range(c_ndim) %} + int64_t*, + {% endfor %} + bool, + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{{indent}} {{c_ptr}}, +{% for adim in adims %} +{{indent}} {{adim}}, +{% endfor %} +{% for bdim in bdims %} +{{indent}} {{bdim}}, +{% endfor %} +{% for cdim in cdims %} +{{indent}} {{cdim}}, +{% endfor %} +{{indent}} {{use_fp16_acc}}, +{{indent}} stream +{{indent}}); +""" +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +{{indent}}gemm_rrr_small_nk_launcher<{{N}}, {{K}}>( +{{indent}} a_ptr, +{{indent}} b_ptr, +{{indent}} c_ptr, +{{indent}} M, +{{indent}} use_fp16_acc, +{{indent}} stream +{{indent}}); +{{indent}}return; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include "cutlass/util/host_tensor.h" + +namespace { + +// For each thread, read +// A tile: 8 x K +// B matrix: K x N +// C tile: 8 x N +template +__global__ void gemm_rrr_small_nk_kernel(float4* a_ptr, + float4* b_ptr, + float4* c_ptr, + int M) { + int idx = blockIdx.x * num_thread + threadIdx.x; + + if (idx >= (M + 7) / 8) { + return; + } + + int a_idx_base = idx * K; + a_ptr += a_idx_base; + + // load b matrix + half b[K][N]; + half* b_half = reinterpret_cast(b_ptr); + for (int i = 0; i < K; ++i) { + for (int j = 0; j < N; ++j) { + b[i][j] = b_half[i * N + j]; + } + } + + int c_idx_base = idx * N; + c_ptr += c_idx_base; + + half c_tile[8][N]; + + if (idx <= M / 8 - 1) { + // fast kernel + // load a + float4 a_tile_vec[K]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < K; i++) { + a_tile_vec[i] = __ldg(a_ptr++); + } + half* a_tile = reinterpret_cast(&a_tile_vec); + + // compute + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 8; ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < N; ++j) { + if (USE_FP16_ACC) { + half sum = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + sum = __hfma(a_tile[i * K + k], b[k][j], sum); + } + c_tile[i][j] = sum; + } else { + float sum = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + sum += __half2float(__hmul(a_tile[i * K + k], b[k][j])); + } + c_tile[i][j] = __float2half_rn(sum); + } + } + } + + // write c + float4* c_tile_vec = reinterpret_cast(&c_tile); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; i++) { + c_ptr[i] = c_tile_vec[i]; + 
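+        // the 8 consecutive output rows owned by this thread are contiguous
+        // in row-major C, so the whole 8 x N half tile is N float4 stores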
} + } else { + // process tail + // load a + half* a_h = reinterpret_cast(a_ptr); + int m = M - M / 8 * 8; + half a_tile[8][K]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < m; i++) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < K; j++) { + a_tile[i][j] = a_h[i * K + j]; + } + } + + // compute + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < m; ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < N; ++j) { + if (USE_FP16_ACC) { + half sum = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + sum = __hfma(a_tile[i][k], b[k][j], sum); + } + c_tile[i][j] = sum; + } else { + float sum = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < K; ++k) { + sum += __half2float(__hmul(a_tile[i][k], b[k][j])); + } + c_tile[i][j] = __float2half_rn(sum); + } + } + } + + // write c + half* c_h = reinterpret_cast(c_ptr); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < m; i++) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < N; j++) { + c_h[i * N + j] = c_tile[i][j]; + } + } + } +} + +// N <= 8, K <= 8 +template +void gemm_rrr_small_nk_launcher(cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* c_ptr, + int M, + bool use_fp16_acc, + cudaStream_t stream) { + const int nthread = 256; + dim3 thread_block(nthread); + const int n_element_per_t = nthread * 8; + dim3 grid((M + n_element_per_t - 1) / n_element_per_t); + if(use_fp16_acc) { + gemm_rrr_small_nk_kernel<<>>( + (float4*)a_ptr, + (float4*)b_ptr, + (float4*)c_ptr, + M + ); + } else { + gemm_rrr_small_nk_kernel<<>>( + (float4*)a_ptr, + (float4*)b_ptr, + (float4*)c_ptr, + M + ); + } +} + +} // namespace + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* c_ptr, + {% for i in range(a_ndim) %} + int64_t *a_dim{{loop.index0}}, + {% endfor %} + {% for i in range(b_ndim) %} + int64_t *b_dim{{loop.index0}}, + {% endfor %} + {% for i in range(c_ndim) %} + int64_t *c_dim{{loop.index0}}, + {% endfor %} + bool use_fp16_acc, + cudaStream_t stream +) { + {{shape_function}} + {{input_output_checks}} + {{exec_paths}} +} + +""" +) + + +@registry.reg("cuda.gemm_rrr_small_nk.gen_function") +def gen_function(func_attrs, exec_cond_template, dim_info_dict): + func_name = func_attrs["name"] + shape_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + b = func_attrs["inputs"][1] + bshape = b._attrs["shape"] + k = bshape[0]._attrs["values"][0] + n = bshape[1]._attrs["values"][0] + + a_ndim = func_attrs["inputs"][0]._rank() + b_ndim = func_attrs["inputs"][1]._rank() + c_ndim = func_attrs["outputs"][0]._rank() + + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=a_ndim, + weight_ndims=2, + output_ndims=c_ndim, + ) + if n == 0 or k == 0: + # avoid "zero-sized variable not allowed in device code" error + exec_paths = "" + else: + exec_paths = EXEC_TEMPLATE.render(indent=" ", N=n, K=k) + return SRC_TEMPLATE.render( + function_name=func_name, + shape_function=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_paths, + a_ndim=a_ndim, + b_ndim=b_ndim, + c_ndim=c_ndim, + ) + + +@registry.reg("cuda.gemm_rrr_small_nk.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + a_ndim = func_attrs["inputs"][0]._rank() + b_ndim = func_attrs["inputs"][1]._rank() + c_ndim = func_attrs["outputs"][0]._rank() + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, a_ndim=a_ndim, b_ndim=b_ndim, c_ndim=c_ndim + ) + + +@registry.reg("cuda.gemm_rrr_small_nk.func_call") +def 
gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + ashape = a._attrs["shape"] + adims = ["&" + dim._attrs["name"] for dim in ashape] + b = func_attrs["inputs"][1] + bshape = b._attrs["shape"] + bdims = ["&" + dim._attrs["name"] for dim in bshape] + c = func_attrs["outputs"][0] + cshape = c._attrs["shape"] + cdims = ["&" + dim._attrs["name"] for dim in cshape] + use_fp16_acc = False + if "use_fp16_acc" in Target.current()._kwargs: + use_fp16_acc = Target.current()._kwargs["use_fp16_acc"] + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + c_ptr=c._attrs["name"], + adims=adims, + bdims=bdims, + cdims=cdims, + use_fp16_acc="true" if use_fp16_acc else "false", + indent=indent, + ) + + +@registry.reg("cuda.gemm_rrr_small_nk.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/__init__.py b/python/aitemplate/backend/cuda/gemm_universal/__init__.py new file mode 100644 index 000000000..c07983128 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/__init__.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# flake8: noqa +from . import ( + bmm_ccr, + bmm_ccr_add, + bmm_crr, + bmm_crr_add, + bmm_rcr, + bmm_rcr_permute, + bmm_rrr, + bmm_rrr_add, + bmm_rrr_permute, + gemm_rcr, + gemm_rcr_bias, + gemm_rcr_bias_add, + gemm_rcr_bias_add_add, + gemm_rcr_bias_add_add_relu, + gemm_rcr_bias_add_relu, + gemm_rcr_bias_fast_gelu, + gemm_rcr_bias_gelu, + gemm_rcr_bias_hardswish, + gemm_rcr_bias_mul, + gemm_rcr_bias_mul_add, + gemm_rcr_bias_mul_tanh, + gemm_rcr_bias_permute, + gemm_rcr_bias_relu, + gemm_rcr_bias_sigmoid, + gemm_rcr_bias_sigmoid_mul, + gemm_rcr_bias_sigmoid_mul_tanh, + gemm_rcr_bias_swish, + gemm_rcr_bias_tanh, + gemm_rcr_permute, + gemm_rrr, + gemm_rrr_permute, + group_gemm_rcr, + group_gemm_rcr_bias, + group_gemm_rcr_bias_relu, + group_gemm_rcr_bias_sigmoid, + perm021fc_ccr, + perm021fc_ccr_bias, + perm021fc_ccr_bias_permute, + perm021fc_crc, + perm021fc_crc_bias, + perm102_bmm_rcr, + perm102_bmm_rcr_bias, + perm102_bmm_rrr, + perm102_bmm_rrr_bias, +) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py new file mode 100644 index 000000000..25ad9e9a8 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr.py @@ -0,0 +1,142 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
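As a sanity check for `gemm_rrr_small_nk` above, here is a hedged PyTorch sketch of the two accumulation paths selected by `use_fp16_acc` (which the generated call reads from `Target.current()._kwargs`). It is a reference for the math only: the kernel multiplies in fp16 in both paths and only the accumulator width changes, so the fp32 path below can differ from the kernel in the last bits.

```python
import torch

def gemm_rrr_small_nk_ref(a, b, use_fp16_acc=False):
    # a: [M, K] (leading batch dims flattened into M), b: [K, N]; N <= 8, K <= 8
    if use_fp16_acc:
        return a.half() @ b.half()          # fp16 products, fp16 accumulation
    return (a.float() @ b.float()).half()   # fp32 accumulation, cast back to fp16
```

Note that the Python-side codegen above also emits an empty exec path when `n == 0` or `k == 0` (to avoid zero-sized device arrays), so the reference assumes non-degenerate shapes.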
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen for bmm_ccr, which computes A @ B + bias. +A[ColMajor], B[ColMajor], bias[RowMajor] +""" +from ... import registry +from ...common import gemm_common +from . import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "N * K", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "M", + "ldb": "K", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.bmm_ccr.config") +def bmm_ccr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.ColumnMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.bmm_ccr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_ccr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_ccr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_ccr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_ccr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. 
+ func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py new file mode 100644 index 000000000..ea9ff0510 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_ccr_add.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen for bmm_ccr_add, which computes A @ B + bias + C. +A[ColMajor], B[ColMajor], bias / C[RowMajor] +""" +from ... import registry +from ...common import gemm_common +from . import bmm_ccr, bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.bmm_ccr_add.config") +def bmm_ccr_add_config(func_attrs, dtype="float16"): + return bmm_ccr.bmm_ccr_config(func_attrs, dtype) + + +@registry.reg("cuda.bmm_ccr_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = bmm_ccr._get_problem_info( + bias_ptr="d_ptr", + alpha_value=func_attrs.get("alpha", 1), + beta_value=1, + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_ccr_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = bmm_ccr._get_problem_info( + bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1 + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_ccr_add.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_ccr_add.func_call") +def gen_function_call(func_attrs, indent=" 
"): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_ccr_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py new file mode 100644 index 000000000..7b22806e3 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_common.py @@ -0,0 +1,391 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common functions and templates for bmm-family ops +""" +from dataclasses import dataclass + +import jinja2 + +from ...common import gemm_common +from . import common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +# ARGS_PARSER is only used by profiler, so the batch is not of concern. +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t B = std::atoi(argv[1]); + int64_t M = std::atoi(argv[2]); + int64_t N = std::atoi(argv[3]); + int64_t K = std::atoi(argv[4]); + +{% for dim in a_dims %} + int64_t a_dim{{loop.index0}} = {{dim}}; +{% endfor %} +{% for dim in b_dims %} + int64_t b_dim{{loop.index0}} = {{dim}}; +{% endfor %} +{% for dim in c_dims %} + int64_t c_dim{{loop.index0}} = {{dim}}; +{% endfor %} +""" +) + +OUTPUT_ADDR_CALCULATOR = jinja2.Template( + """ + int64_t output_batch_stride = {{output_batch_stride_dim}}; + int64_t output_stride = {{output_stride_dim}}; + int64_t output_offset = {{output_offset_val}}; // default to 0 + """ +) + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, +{% if has_d %} + cutlass::half_t*, +{% endif %} + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(a_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(b_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(c_ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{indent}}{{local_dim_defs}} +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{% if has_d %} +{{indent}} {{d_ptr}}, +{% endif %} +{% if has_bias %} +{{indent}} {{bias_ptr}}, +{% endif %} +{{indent}} {{c_ptr}}, +{{indent}} global_workspace, +{% for dim in a_dims_ptr %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in b_dims_ptr %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in c_dims_ptr %} +{{indent}} {{dim}}, +{% endfor %} +{{indent}} stream +{{indent}}); +{{indent}}} +""" +) + + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + // cast to int64_t to avoid overflow + int64_t a_ptr_sz = 1; + {% for idx in range(a_ndims) %} + {{indent}} {{indent}} a_ptr_sz 
*= static_cast(a_dim{{idx}}); + {% endfor %} + + int64_t b_ptr_sz = 1; + {% for idx in range(b_ndims) %} + {{indent}} {{indent}} b_ptr_sz *= static_cast(b_dim{{idx}}); + {% endfor %} + + int64_t c_ptr_sz = 1; + {% for idx in range(c_ndims) %} + {{indent}} {{indent}} c_ptr_sz *= static_cast(c_dim{{idx}}); + {% endfor %} + + // The value 1 is used to force ptr_max_sz to be non-zero + int64_t ptr_max_sz = std::max({1, a_ptr_sz, b_ptr_sz, c_ptr_sz}); + // TODO: special pool size for A100 L2 cache 40M + // need to tune it for other devices + int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 25) / ptr_max_sz))); + + memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz); // a_ptr: index 0 + memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz); // b_ptr: index 1 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // c_ptr: index 2 +{% if has_bias %} + memory_pool->AllocateHalfTensor(c_dim2, mem_pool_sz); // bias_ptr: index 3 +{% endif %} +{% if has_d %} + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // d_ptr: index 3 (no bias) or 4 +{% endif %} +""" +) + + +@dataclass +class Bmm_problem_info: + alpha_value: float = 1 + beta_value: float = 0 + problem_size: str = "{M, N, K}" + batch_size: str = "B" + a_ptr: str = "a_ptr" + b_ptr: str = "b_ptr" + bias_ptr: str = "d_ptr" + c_ptr: str = "c_ptr" + a_batch_stride: str = "0" + b_batch_stride: str = "0" + bias_batch_stride: str = "0" + c_batch_stride: str = "0" + lda: str = "0" + ldb: str = "0" + ldbias: str = "0" + ldc: str = "0" + + +def _update_stride_info(mm_info, a_shapes, b_shapes, bias_shapes=None): + if len(a_shapes) == 2 or a_shapes[0] == 1: + mm_info.a_batch_stride = "0" + if len(b_shapes) == 2 or b_shapes[0] == 1: + mm_info.b_batch_stride = "0" + + if bias_shapes is None: + return + + if len(bias_shapes) < 3 or bias_shapes[0] == 1: + mm_info.bias_batch_stride = "0" + if len(bias_shapes) < 2 or all([x == 1 for x in bias_shapes[:-1]]): + mm_info.ldbias = "0" + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kBatched, + {{mm_info.problem_size}}, + {{mm_info.batch_size}}, + {ElementComputeEpilogue({{mm_info.alpha_value}}), ElementComputeEpilogue({{mm_info.beta_value}})}, + (void*) {{mm_info.a_ptr}}, + (void*) {{mm_info.b_ptr}}, + (void*) {{mm_info.bias_ptr}}, + (void*) {{mm_info.c_ptr}}, + {{mm_info.a_batch_stride}}, + {{mm_info.b_batch_stride}}, + {{mm_info.bias_batch_stride}}, + {{mm_info.c_batch_stride}}, + {{mm_info.lda}}, + {{mm_info.ldb}}, + {{mm_info.ldbias}}, + {{mm_info.ldc}} +""" +) + + +def reverse_dim_info_mapping(dim_info_dict, source, tensor_idx): + def _fill(arr, idx, val): + if len(arr) <= idx: + arr = arr + [None] * (idx - len(arr) + 1) + arr[idx] = val + return arr + + ret = [] + for name, dim_infos in dim_info_dict.items(): + for dim_info in dim_infos: + if dim_info.source == source and dim_info.tensor_idx == tensor_idx: + for dim_idx in dim_info.dim_idx: + ret = _fill(ret, dim_idx, name) + + if None in ret: + raise RuntimeError( + "dim_info_dict for source: {}, tensor_idx: {} not complete.".format( + source, tensor_idx + ) + ) + + return ret + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args, + args_parser, + bias_ptr_arg=None, +): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + + a_ndims = len(func_attrs["input_accessors"][0].original_shapes) + b_ndims = len(func_attrs["input_accessors"][1].original_shapes) + c_ndims = 
len(func_attrs["output_accessors"][0].original_shapes) + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + assert not (has_d and has_bias) + for op_name, op in op_instance.items(): + config = common.emit_instance(op, for_profiler=True) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = common.EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + problem_args=problem_args, + ) + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=a_ndims, + weight_ndims=b_ndims, + output_ndims=c_ndims, + ) + op_func = src_template.render( + instances=instance, + function_name="bmm", + input_ndims=a_ndims, + weight_ndims=b_ndims, + output_ndims=c_ndims, + shape_eval=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_program, + has_d=has_d, + ) + a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)] + b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)] + c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)] + func_call = FUNC_CALL_TEMPLATE.render( + func_name="bmm", + a_ptr="memory_pool->RequestHalfTensorByIdx(0)", + b_ptr="memory_pool->RequestHalfTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestHalfTensorByIdx(2)", + d_ptr="memory_pool->RequestHalfTensorByIdx(%d)" % (4 if has_bias else 3), + has_d=has_d, + a_dims_ptr=a_dims_ptr, + b_dims_ptr=b_dims_ptr, + c_dims_ptr=c_dims_ptr, + ) + code = common.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parser, + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render( + name=name, + a_ndims=a_ndims, + b_ndims=b_ndims, + c_ndims=c_ndims, + has_d=has_d, + has_bias=has_bias, + ), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + return FUNC_DECL_TEMPLATE.render( + func_name=func_name, + a_ndims=len(func_attrs["input_accessors"][0].original_shapes), + b_ndims=len(func_attrs["input_accessors"][1].original_shapes), + c_ndims=len(func_attrs["output_accessors"][0].original_shapes), + has_d=has_d, + ) + + +def gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + input_addr_calculator="", + output_addr_calculator="", +): + return common.gen_function( + func_attrs, + common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=len(func_attrs["input_accessors"][0].original_shapes), + weight_ndims=len(func_attrs["input_accessors"][1].original_shapes), + output_ndims=len(func_attrs["output_accessors"][0].original_shapes), + dim_info_dict=dim_info_dict, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=output_addr_calculator, + ) + + +def gen_function_call(func_attrs, indent=" ", bias_ptr_arg=None): + a = func_attrs["inputs"][0] + ashape = func_attrs["input_accessors"][0].original_shapes + a_dims_ptr = [f'&{ashape[idx]._attrs["name"]}' for idx in range(len(ashape))] + b = func_attrs["inputs"][1] + bshape = func_attrs["input_accessors"][1].original_shapes + b_dims_ptr = [f'&{bshape[idx]._attrs["name"]}' for idx in range(len(bshape))] + c = func_attrs["outputs"][0] + cshape = 
func_attrs["output_accessors"][0].original_shapes + c_dims_ptr = [f'&{cshape[idx]._attrs["name"]}' for idx in range(len(cshape))] + has_d = False + d_ptr = None + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + d_ptr = func_attrs["inputs"][2]._attrs["name"] + has_bias = bias_ptr_arg is not None + assert not (has_d and has_bias) + + local_dim_defs = common.gen_local_dim_defs(func_attrs, indent=indent) + + return FUNC_CALL_TEMPLATE.render( + local_dim_defs=local_dim_defs, + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr=c._attrs["name"], + d_ptr=d_ptr, + has_d=has_d, + a_dims_ptr=a_dims_ptr, + b_dims_ptr=b_dims_ptr, + c_dims_ptr=c_dims_ptr, + indent=indent, + ) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py new file mode 100644 index 000000000..62d6eee96 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_crr, which computes A @ B + bias. +A[ColMajor], B[RowMajor], bias[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . 
import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "N * K", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "M", + "ldb": "N", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.bmm_crr.config") +def bmm_crr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.ColumnMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.bmm_crr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_crr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_crr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_crr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_crr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py new file mode 100644 index 000000000..2767af9b0 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_crr_add.py @@ -0,0 +1,104 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
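The `_get_problem_info` helpers in `bmm_ccr` and `bmm_crr` above (and in the rcr/rrr variants later in this patch) differ only in the leading dimensions they hand to GemmUniversal. A small standalone helper, not part of the patch, that captures the convention: a row-major operand uses its inner extent as its leading dimension, a column-major operand uses its outer one, with A being M x K and B being K x N.

```python
def leading_dims(a_layout: str, b_layout: str, M: int, N: int, K: int):
    # "r" = row-major, "c" = column-major; A is [M, K], B is [K, N]
    lda = K if a_layout == "r" else M
    ldb = N if b_layout == "r" else K
    return lda, ldb

assert leading_dims("c", "c", 4, 5, 6) == (4, 6)  # bmm_ccr: lda=M, ldb=K
assert leading_dims("c", "r", 4, 5, 6) == (4, 5)  # bmm_crr: lda=M, ldb=N
assert leading_dims("r", "c", 4, 5, 6) == (6, 6)  # bmm_rcr: lda=K, ldb=K
assert leading_dims("r", "r", 4, 5, 6) == (6, 5)  # bmm_rrr: lda=K, ldb=N
```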
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_crr_add, which computes A @ B + bias + C. +A[ColMajor], B[RowMajor], bias / C[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . import bmm_common, bmm_crr, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.bmm_crr_add.config") +def bmm_crr_add_config(func_attrs, dtype="float16"): + return bmm_crr.bmm_crr_config(func_attrs, dtype) + + +@registry.reg("cuda.bmm_crr_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = bmm_crr._get_problem_info( + bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1 + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_crr_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = bmm_crr._get_problem_info( + bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1 + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_crr_add.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_crr_add.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_crr_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py new file mode 100644 index 000000000..582bfd38e --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_permute_common.py @@ -0,0 +1,166 @@ +# Copyright (c) Meta Platforms, 
Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common functions and templates for bmm_permute-family ops +""" +from ...common import gemm_common +from ..gemm_universal import common, common_bias + +from . import bmm_common, common_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args, + args_parser, + emit_kernel=False, + bias_ptr_arg=None, + extra_code="", +): + """Generate code for profiling""" + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + + a_ndims = len(func_attrs["input_accessors"][0].original_shapes) + b_ndims = len(func_attrs["input_accessors"][1].original_shapes) + c_ndims = len(func_attrs["output_accessors"][0].original_shapes) + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + assert not (has_d and has_bias) + for op_name, op in op_instance.items(): + config = common_permute.emit_instance( + op, + for_profiler=True, + emit_kernel=emit_kernel, + func_attrs=func_attrs, + ) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = common.EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + problem_args=problem_args, + ) + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=a_ndims, + weight_ndims=b_ndims, + output_ndims=c_ndims, + ) + op_func = src_template.render( + instances=instance, + function_name="bmm", + input_ndims=a_ndims, + weight_ndims=b_ndims, + output_ndims=c_ndims, + shape_eval=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_program, + has_d=has_d, + extra_code=extra_code, + ) + a_dims_ptr = [f"&a_dim{idx}" for idx in range(a_ndims)] + b_dims_ptr = [f"&b_dim{idx}" for idx in range(b_ndims)] + c_dims_ptr = [f"&c_dim{idx}" for idx in range(c_ndims)] + func_call = bmm_common.FUNC_CALL_TEMPLATE.render( + func_name="bmm", + a_ptr="memory_pool->RequestHalfTensorByIdx(0)", + b_ptr="memory_pool->RequestHalfTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestHalfTensorByIdx(2)", + d_ptr="memory_pool->RequestHalfTensorByIdx(%d)" % (4 if has_bias else 3), + has_d=has_d, + a_dims_ptr=a_dims_ptr, + b_dims_ptr=b_dims_ptr, + c_dims_ptr=c_dims_ptr, + ) + code = common.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parser, + func_call=func_call, + name=name, + tensor_decl=bmm_common.TENSOR_DECL_TEMPLATE.render( + name=name, + a_ndims=a_ndims, + b_ndims=b_ndims, + c_ndims=c_ndims, + has_d=has_d, + has_bias=has_bias, + ), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def 
gen_function_decl(func_attrs): + """Rendering argument to function declaration template""" + func_name = func_attrs["name"] + has_d = False + if "has_d" in func_attrs: + has_d = func_attrs["has_d"] + return bmm_common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + a_ndims=len(func_attrs["input_accessors"][0].original_shapes), + b_ndims=len(func_attrs["input_accessors"][1].original_shapes), + c_ndims=len(func_attrs["output_accessors"][0].original_shapes), + has_d=has_d, + ) + + +def gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + input_addr_calculator="", + output_addr_calculator="", + extra_code="", + has_bias=False, +): + return common_permute.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE if has_bias else common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=len(func_attrs["input_accessors"][0].original_shapes), + weight_ndims=len(func_attrs["input_accessors"][1].original_shapes), + output_ndims=len(func_attrs["output_accessors"][0].original_shapes), + dim_info_dict=dim_info_dict, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=output_addr_calculator, + emit_kernel=True, + extra_code=extra_code, + ) + + +def gen_function_call(func_attrs, indent=" ", bias_ptr_arg=None): + return bmm_common.gen_function_call(func_attrs, indent, bias_ptr_arg) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py new file mode 100644 index 000000000..d660f3c61 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr.py @@ -0,0 +1,211 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_rcr, which computes A @ B + bias. +A[RowMajor], B[ColMajor], bias[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . 
import bmm_common, common +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_default_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "N * K", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "K", + "ldb": "K", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.bmm_rcr.config") +def bmm_rcr_config(func_attrs, dtype="float16"): + common.make_fproc_f16(func_attrs, RCR) + + +@registry.reg("cuda.bmm_rcr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_rcr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_a_batch_stride_dim = "M * K" + input_a_stride_k_dim = "K" + input_a_offset = 0 + input_b_batch_stride_dim = "N * K" + input_b_stride_k_dim = "K" + input_b_offset = 0 + + if "input_accessors" in func_attrs: + input_a_accessor = func_attrs["input_accessors"][0] + input_b_accessor = func_attrs["input_accessors"][1] + + if input_a_accessor.is_from_strided_tensor: + input_a_offset = input_a_accessor.offset + if not input_a_accessor.is_contiguous: + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + + input_a_batch_stride_dim = input_a_accessor.gen_stride_str(0, a_dims) + input_a_stride_k_dim = input_a_accessor.stride(1) + + if input_b_accessor.is_from_strided_tensor: + input_b_offset = input_b_accessor.offset + if not input_b_accessor.is_contiguous: + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + input_b_batch_stride_dim = input_b_accessor.gen_stride_str(0, b_dims) + input_b_stride_k_dim = input_b_accessor.stride(1) + + input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render( + input_a_batch_stride_dim=input_a_batch_stride_dim, + input_a_stride_dim=input_a_stride_k_dim, + input_a_offset_val=input_a_offset, + input_b_batch_stride_dim=input_b_batch_stride_dim, + input_b_stride_dim=input_b_stride_k_dim, + input_b_offset_val=input_b_offset, + ) + + output_batch_stride_dim = "M * N" + output_stride_n_dim = "N" + output_offset = 0 + + if "output_accessors" in func_attrs: + output_accessor = func_attrs["output_accessors"][0] + if output_accessor.is_from_strided_tensor: + output_offset = output_accessor.offset + if not output_accessor.is_contiguous: + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + output_batch_stride_dim = 
output_accessor.gen_stride_str(0, c_dims) + output_stride_n_dim = output_accessor.stride(1) + + output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render( + output_batch_stride_dim=output_batch_stride_dim, + output_stride_dim=output_stride_n_dim, + output_offset_val=output_offset, + ) + + bmm_problem_info = bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + a_ptr="(a_ptr + input_a_offset)", + b_ptr="(b_ptr + input_b_offset)", + bias_ptr="(c_ptr + output_offset)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="input_a_batch_stride", + b_batch_stride="input_b_batch_stride", + bias_batch_stride="output_batch_stride", + c_batch_stride="output_batch_stride", + lda="input_a_stride", + ldb="input_b_stride", + ldbias="output_stride", + ldc="output_stride", + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + input_addr_calculator, + output_addr_calculator, + ) + + +@registry.reg("cuda.bmm_rcr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_rcr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rcr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py new file mode 100644 index 000000000..2dc737be5 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rcr_permute.py @@ -0,0 +1,211 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_rcr_permute, which computes permute(A @ B + bias). +A[RowMajor], B[ColMajor], bias[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . 
import bmm_common, bmm_permute_common, common, common_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.bmm_rcr_permute.config") +def bmm_rcr_permute_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common_permute.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + permute_layout=func_attrs["layout"], + ) + + func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs) + + +@registry.reg("cuda.bmm_rcr_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + bmm_problem_info = bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + bias_ptr="c_ptr", + a_batch_stride="M * K", + b_batch_stride="N * K", + bias_batch_stride="M * N", + c_batch_stride="0", + lda="K", + ldb="K", + ldbias="N", + ldc="N", + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=bmm_problem_info, + ) + + bmm_permute_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + emit_kernel=True, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.bmm_rcr_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_a_batch_stride_dim = "M * K" + input_a_stride_k_dim = "K" + input_a_offset = 0 + input_b_batch_stride_dim = "N * K" + input_b_stride_k_dim = "K" + input_b_offset = 0 + + if "input_accessors" in func_attrs: + input_a_accessor = func_attrs["input_accessors"][0] + input_b_accessor = func_attrs["input_accessors"][1] + + if input_a_accessor.is_from_strided_tensor: + input_a_offset = input_a_accessor.offset + if not input_a_accessor.is_contiguous: + input_a_batch_stride_dim = input_a_accessor.stride(0) + input_a_stride_k_dim = input_a_accessor.stride(1) + + if input_b_accessor.is_from_strided_tensor: + input_b_offset = input_b_accessor.offset + if not input_b_accessor.is_contiguous: + input_b_batch_stride_dim = input_b_accessor.stride(0) + input_b_stride_k_dim = input_b_accessor.stride(1) + + input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render( + input_a_batch_stride_dim=input_a_batch_stride_dim, + input_a_stride_dim=input_a_stride_k_dim, + input_a_offset_val=input_a_offset, + input_b_batch_stride_dim=input_b_batch_stride_dim, + input_b_stride_dim=input_b_stride_k_dim, + input_b_offset_val=input_b_offset, + ) + + output_batch_stride_dim = "M * N" + output_stride_n_dim = "N" + output_offset = 0 + + if "output_accessors" in func_attrs: + output_accessor = func_attrs["output_accessors"][0] + if output_accessor.is_from_strided_tensor: + output_offset = output_accessor.offset + if not output_accessor.is_contiguous: + output_batch_stride_dim = output_accessor.stride(0) + 
output_stride_n_dim = output_accessor.stride(1) + + output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render( + output_batch_stride_dim=output_batch_stride_dim, + output_stride_dim=output_stride_n_dim, + output_offset_val=output_offset, + ) + + bmm_problem_info = bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + a_ptr="(a_ptr + input_a_offset)", + b_ptr="(b_ptr + input_b_offset)", + bias_ptr="(c_ptr + output_offset)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="input_a_batch_stride", + b_batch_stride="input_b_batch_stride", + bias_batch_stride="output_batch_stride", + c_batch_stride="0", + lda="input_a_stride", + ldb="input_b_stride", + ldbias="output_stride", + ldc="output_stride", + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=bmm_problem_info, + ) + + return bmm_permute_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + input_addr_calculator, + output_addr_calculator, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.bmm_rcr_permute.func_decl") +def gen_function_decl(func_attrs): + return bmm_permute_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_rcr_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_permute_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rcr_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py new file mode 100644 index 000000000..bc752b1bb --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr.py @@ -0,0 +1,145 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_rrr, which computes A @ B + bias. +A[RowMajor], B[RowMajor], bias / C[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . 
import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "N * K", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "K", + "ldb": "N", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.bmm_rrr.config") +def bmm_rrr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.bmm_rrr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_rrr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_rrr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_rrr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rrr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py new file mode 100644 index 000000000..bb8201291 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_add.py @@ -0,0 +1,121 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_rrr_add, which computes A @ B + bias + C. +A[RowMajor], B[RowMajor], bias / C[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . import bmm_common, bmm_rrr, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.bmm_rrr_add.config") +def bmm_rrr_add_config(func_attrs, dtype="float16"): + return bmm_rrr.bmm_rrr_config(func_attrs, dtype) + + +@registry.reg("cuda.bmm_rrr_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + mm_info = bmm_rrr._get_problem_info( + bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1 + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.bmm_rrr_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = bmm_rrr._get_problem_info( + bias_ptr="d_ptr", alpha_value=func_attrs.get("alpha", 1), beta_value=1 + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + d_shapes = func_attrs["input_accessors"][2].original_shapes + bmm_common._update_stride_info(mm_info, a_shapes, b_shapes, d_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.bmm_rrr_add.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_rrr_add.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rrr_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py new file mode 100644 index 000000000..d1d17ee8d --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_rrr_permute.py @@ -0,0 +1,219 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Codegen for bmm_rrr_permute, which computes permute(A @ B + bias). +A[RowMajor], B[RowMajor], bias / C[RowMajor] +""" + +from ... import registry +from ...common import gemm_common +from . import bmm_common, bmm_permute_common, common, common_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.bmm_rrr_permute.config") +def bmm_rrr_permute_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common_permute.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + permute_layout=func_attrs["layout"], + ) + + func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs) + + +@registry.reg("cuda.bmm_rrr_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=a_dims, b_dims=b_dims, c_dims=c_dims + ) + + bmm_problem_info = bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + bias_ptr="c_ptr", + a_batch_stride="M * K", + b_batch_stride="K * N", + bias_batch_stride="M * N", + c_batch_stride="0", + lda="K", + ldb="N", + ldbias="N", + ldc="N", + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=bmm_problem_info, + ) + + bmm_permute_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + emit_kernel=True, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.bmm_rrr_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_a_batch_stride_dim = "M * K" + input_a_stride_k_dim = "K" + input_a_offset = 0 + input_b_batch_stride_dim = "K * N" + input_b_stride_k_dim = "N" + input_b_offset = 0 + + if "input_accessors" in func_attrs: + input_a_accessor = func_attrs["input_accessors"][0] + input_b_accessor = func_attrs["input_accessors"][1] + + if 
input_a_accessor.is_from_strided_tensor: + input_a_offset = input_a_accessor.offset + if not input_a_accessor.is_contiguous: + a_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 0 + ) + + input_a_batch_stride_dim = input_a_accessor.gen_stride_str(0, a_dims) + input_a_stride_k_dim = input_a_accessor.stride(1) + + if input_b_accessor.is_from_strided_tensor: + input_b_offset = input_b_accessor.offset + if not input_b_accessor.is_contiguous: + b_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.INPUT, 1 + ) + input_b_batch_stride_dim = input_b_accessor.gen_stride_str(0, b_dims) + input_b_stride_k_dim = input_b_accessor.stride(1) + + input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render( + input_a_batch_stride_dim=input_a_batch_stride_dim, + input_a_stride_dim=input_a_stride_k_dim, + input_a_offset_val=input_a_offset, + input_b_batch_stride_dim=input_b_batch_stride_dim, + input_b_stride_dim=input_b_stride_k_dim, + input_b_offset_val=input_b_offset, + ) + + output_batch_stride_dim = "M * N" + output_stride_n_dim = "N" + output_offset = 0 + + if "output_accessors" in func_attrs: + output_accessor = func_attrs["output_accessors"][0] + if output_accessor.is_from_strided_tensor: + output_offset = output_accessor.offset + if not output_accessor.is_contiguous: + c_dims = bmm_common.reverse_dim_info_mapping( + dim_info_dict, gemm_common.Source.OUTPUT, 0 + ) + output_batch_stride_dim = output_accessor.gen_stride_str(0, c_dims) + output_stride_n_dim = output_accessor.stride(1) + + output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render( + output_batch_stride_dim=output_batch_stride_dim, + output_stride_dim=output_stride_n_dim, + output_offset_val=output_offset, + ) + + bmm_problem_info = bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + a_ptr="(a_ptr + input_a_offset)", + b_ptr="(b_ptr + input_b_offset)", + bias_ptr="(c_ptr + output_offset)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="input_a_batch_stride", + b_batch_stride="input_b_batch_stride", + bias_batch_stride="output_batch_stride", + c_batch_stride="0", + lda="input_a_stride", + ldb="input_b_stride", + ldbias="output_stride", + ldc="output_stride", + ) + a_shapes = func_attrs["input_accessors"][0].original_shapes + b_shapes = func_attrs["input_accessors"][1].original_shapes + bmm_common._update_stride_info(bmm_problem_info, a_shapes, b_shapes) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + return bmm_permute_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + input_addr_calculator, + output_addr_calculator, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.bmm_rrr_permute.func_decl") +def gen_function_decl(func_attrs): + return bmm_permute_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.bmm_rrr_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_permute_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.bmm_rrr_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py b/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py new file mode 100644 index 000000000..742d601a0 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/bmm_softmax_bmm_permute.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from ... import registry + + +@registry.reg("cuda.bmm_softmax_bmm_permute.func_decl") +def gen_function_decl(func_attrs): + raise NotImplementedError("bmm_softmax_bmm_permute kernel is not implemented.") + + +@registry.reg("cuda.bmm_softmax_bmm_permute.gen_function") +def gen_function(func_attrs): + raise NotImplementedError("bmm_softmax_bmm_permute kernel is not implemented.") + + +@registry.reg("cuda.bmm_softmax_bmm_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + raise NotImplementedError("bmm_softmax_bmm_permute kernel is not implemented.") diff --git a/python/aitemplate/backend/cuda/gemm_universal/common.py b/python/aitemplate/backend/cuda/gemm_universal/common.py new file mode 100644 index 000000000..199311035 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/common.py @@ -0,0 +1,944 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common codegen functions for gemm. +""" + +import os +import random +import re +from collections import OrderedDict +from hashlib import sha1 +from typing import Any, Dict, List, Tuple + +import jinja2 + +from ....compiler.base import IntImm + +from ... 
import builder +from ...common import gemm_common, tensor_accessor_codegen +from ...target import Target + +# pylint: disable=C0301,C0415,R1705 + + +INPUT_ADDR_CALCULATOR = jinja2.Template( + """ + int64_t input_a_batch_stride = {{input_a_batch_stride_dim}}; + int64_t input_a_stride = {{input_a_stride_dim}}; + int64_t input_a_offset = {{input_a_offset_val}}; // default to 0 + int64_t input_b_batch_stride = {{input_b_batch_stride_dim}}; + int64_t input_b_stride = {{input_b_stride_dim}}; + int64_t input_b_offset = {{input_b_offset_val}}; // default to 0 + """ +) + + +# These should be only used for 2D gemm +# For templates for bmm, see bmm_common +OUTPUT_ADDR_CALCULATOR = jinja2.Template( + """ + {% if not output_accessor.is_from_strided_tensor %} + int64_t output_stride = {{stride_dim}}; + int64_t output_offset = 0; + {% else %} + int64_t output_stride = {{output_accessor.actual_total_elements_from_stride_dim}}; + int64_t output_offset = {{output_accessor.offset}}; + {% endif %} + """ +) + +DEFAULT_OUTPUT_ADDR_CALCULATOR = jinja2.Template( + """ + int64_t output_stride = {{stride_dim}}; + int64_t output_offset = 0; + """ +) + +DIM_DEFS_TEMPLATE = jinja2.Template( + """ +{% for dim, value in dims.items() %} +{{indent}}int64_t {{dim}} = {{value}}; +{% endfor %} +""" +) + + +INPUT_OUTPUT_CHECKS_TEMPLATE = jinja2.Template( + """ + int64_t a_size = 1; +{% for idx in range(input_ndims) %} + a_size *= *a_dim{{idx}}; +{% endfor %} + if (a_size != 0 && !a_ptr) { + throw std::runtime_error("input a is null!"); + } + + int64_t b_size = 1; +{% for idx in range(weight_ndims) %} + b_size *= *b_dim{{idx}}; +{% endfor %} + if (b_size != 0 && !b_ptr) { + throw std::runtime_error("input b is null!"); + } + + int64_t c_size = 1; +{% for idx in range(output_ndims) %} + c_size *= *c_dim{{idx}}; +{% endfor %} + if (c_size != 0) { + if (!c_ptr) { + throw std::runtime_error("input c is null!"); + } + } else { + // output is empty and safe to return + return; + } + + // One of the input tensor are empty + if (a_size == 0 || b_size == 0) { + return; + } +""" +) + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = {{config_name}}; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/device_memory.h" + +{{extra_code}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, +{% if has_d %} + cutlass::half_t* d_ptr, +{% endif %} + cutlass::half_t* c_ptr, + uint8_t* workspace, +{% if support_split_k %} + int split_k, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t* a_dim{{idx}}, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t* b_dim{{idx}}, +{% endfor %} +{% for idx in range(output_ndims) %} + int64_t* c_dim{{idx}}, +{% endfor %} + cudaStream_t stream + 
) { + {{shape_eval}} + {{input_addr_calculator}} + {{output_addr_calculator}} + {{extra_shape}} + {{input_output_checks}} + + {{exec_paths}} + {% for idx in range(input_ndims) %} + std::cout << "input_ndims{{idx}}: " << *a_dim{{idx}} << std::endl; + {% endfor %} + {% for idx in range(weight_ndims) %} + std::cout << "weight_ndims{{idx}}: " << *b_dim{{idx}} << std::endl; + {% endfor %} + {% for idx in range(output_ndims) %} + std::cout << "output_ndims{{idx}}: " << *c_dim{{idx}} << std::endl; + {% endfor %} + throw std::runtime_error( + "Unsupported workload for this {{function_name}} specialization." + ); +} +""", + trim_blocks=True, + lstrip_blocks=True, +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +// TODO: cast to right dtype +{{indent}}using ElementComputeEpilogue = typename {{instance}}::ElementAccumulator; + +{{indent}}typename {{instance}}::Arguments arguments{ + +{{problem_args}} + +{{indent}}}; +{{indent}}{{instance}} gemm_op; +{% if is_profiler %} +{{indent}}// https://www.youtube.com/watch?v=rRwxfYlgG-M +{{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} +{{indent}}auto status = gemm_op.can_implement(arguments); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = gemm_op.initialize(arguments, workspace, stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}return; +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{indent}}{{local_dim_defs}} +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{% if has_bias %} +{{indent}} {{bias_ptr}}, +{% endif %} +{{indent}} {{c_ptr}}, +{{indent}} global_workspace, +{{indent}} {{split_k}}, +{% for dim in adims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in bdims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in cdims %} +{{indent}} {{dim}}, +{% endfor %} +{{indent}} stream +{{indent}}); +{{indent}}} +""" +) + + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + int64_t a_ptr_sz = a_dim0 * a_dim1; + int64_t b_ptr_sz = b_dim0 * b_dim1; + int64_t c_ptr_sz = c_dim0 * c_dim1; + + // The value 1 is used to force ptr_max_sz to be non-zero + int64_t ptr_max_sz = std::max({1, a_ptr_sz, b_ptr_sz, c_ptr_sz}); + // TODO: special pool size for A100 L2 cache 40M + // need to tune it for other devices + int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 25) / ptr_max_sz))); + + memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz); // a_ptr: index 0 + memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz); // b_ptr: index 1 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // c_ptr: index 2 + +{% if has_bias %} + memory_pool->AllocateHalfTensor(c_dim1, mem_pool_sz); // bias_ptr: index 3 +{% endif %} + +""" +) + + +# TODO Merge all alignment into single profiler +PROFILER_TEMPLATE = jinja2.Template( + """ +size_t GLOBAL_WORKSPACE_SIZE = 0; + +{{op_func}} + +struct ProfilerMemoryPool { + ProfilerMemoryPool() { + 
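    // Seed the RNG used by AllocateGaussianTensor and reserve bookkeeping
+    // storage up front; tensors are later handed out round-robin through
+    // RequestHalfTensorByIdx so repeated profiler calls touch different copies. +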
    std::random_device rd;
+    gen = std::mt19937(rd());
+    uniform_dist = std::uniform_int_distribution<int64_t>(1, 48964896);
+    offsets.reserve(512);
+    strides.reserve(512);
+    copies.reserve(512);
+    ptrs.reserve(512);
+    blobs.reserve(512);
+  }
+  ~ProfilerMemoryPool() {}
+
+  template <typename DType>
+  DType* AllocateGaussianTensor(int64_t size) {
+    size_t length = size * sizeof(DType);
+    blobs.emplace_back(length);
+    DType* ptr = reinterpret_cast<DType*>(blobs.back().get());
+
+    uint64_t seed = uniform_dist(gen);
+    double mean = 0.f;
+    double std = 1.f;
+
+    cutlass::reference::device::BlockFillRandomGaussian(ptr, size, seed, mean,
+                                                        std);
+
+    return ptr;
+  }
+
+
+  cutlass::half_t* AllocateHalfGaussianTensor(int64_t size) {
+    return reinterpret_cast<cutlass::half_t*>(
+        AllocateGaussianTensor<__half>(size));
+  }
+
+  int AllocateHalfTensor(int64_t size, int64_t copy) {
+    offsets.push_back(0);
+    strides.push_back(size);
+    copies.push_back(copy);
+    auto ptr = AllocateHalfGaussianTensor(size * copy);
+    ptrs.push_back(reinterpret_cast<void*>(ptr));
+    return ptrs.size() - 1;
+  }
+
+  cutlass::half_t* RequestHalfTensorByIdx(int idx) {
+    auto copy = copies.at(idx);
+    auto offset = offsets.at(idx);
+    auto stride = strides.at(idx);
+    cutlass::half_t* ptr = reinterpret_cast<cutlass::half_t*>(ptrs.at(idx));
+    ptr += offset;
+    offset += stride;
+    if (offset == copy * stride) {
+      offset = 0;
+    }
+    offsets[idx] = offset;
+    return ptr;
+  }
+
+  std::vector<int64_t> offsets;
+  std::vector<int64_t> strides;
+  std::vector<int64_t> copies;
+  std::vector<void*> ptrs;
+  std::vector<cutlass::DeviceAllocation<uint8_t> > blobs;
+  std::mt19937 gen;
+  std::uniform_int_distribution<int64_t> uniform_dist;
+};
+
+
+int main(int argc, char** argv) {
+  int device_idx;
+  cudaDeviceProp device_properties;
+  cudaError_t result = cudaGetDevice(&device_idx);
+  auto memory_pool = std::make_unique<ProfilerMemoryPool>();
+  if (result != cudaSuccess) {
+    throw std::runtime_error("cudaGetDevice() API call failed.");
+  }
+
+  result = cudaGetDeviceProperties(&device_properties, device_idx);
+
+  if (result != cudaSuccess) {
+    throw std::runtime_error("cudaGetDeviceProperties() failed");
+  }
+
+  {{args_parse}}
+
+  using ElementOutput = typename {{name}}::ElementC;
+  using ElementInputA = typename {{name}}::ElementA;
+  using ElementInputB = typename {{name}}::ElementB;
+  uint8_t* global_workspace = nullptr;
+  cudaStream_t stream = nullptr;
+
+  {{tensor_decl}}
+
+  // warmup
+  for (int i = 0; i < 5; ++i) {
+    {{func_call}}
+  }
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0]);
+  for (int i = 0; i < 10; ++i) {
+    {{func_call}}
+  }
+  cudaEventRecord(events[1]);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  // TODO: output workspace
+  if (runtime_ms < 0.00001) {
+    throw std::runtime_error(
+      "OOB in cutlass."
+ ); + } + std::cout << "TIME:" << runtime_ms << std::endl; + std::cout << "WS:" << GLOBAL_WORKSPACE_SIZE << std::endl; + return 0; +} +""" +) + + +KERNEL_KEY_TEMPLATE = jinja2.Template( + """ +cutlass_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}_align_{{align_ab}}_{{align_c}} +""" +) + + +def has_d(func_attrs): + if "has_d" in func_attrs: + return func_attrs["has_d"] + else: + return False + + +def has_d1(func_attrs): + return func_attrs.get("num_sources", 0) >= 2 + + +def get_gemm_instance_template_params( + op_def: str, + kernel_config: Tuple[str, int, int] = ("cutlass::gemm::device::Gemm", 21, 3), +) -> List[str]: + """ + For a given op_def string generated by cutlass's gemm emiter, parse and + return the gemm instance's template parameters. + kernel_config is a tuple used for finding kernel params. The first element + of kernel_config is the kernel kind, the second is the expected number + of params, and the third is the index offset of alignment values in the + full op_def string. + """ + kernel_kind, expected_num_params, _ = kernel_config + params = re.findall(rf"{kernel_kind}<([\s\S]+)>;", op_def) + assert len(params) == 1 + param = params[0] + gemm_universal_params = param.strip().split("\n") + gemm_universal_params = [param.strip(",") for param in gemm_universal_params] + assert len(gemm_universal_params) == expected_num_params, ( + f"expected len(gemm_universal_params) to be {expected_num_params}, but got " + f"{len(gemm_universal_params)}, {gemm_universal_params=}" + ) + return gemm_universal_params + + +def update_alignments_in_gemm_instance( + op_def: str, + func_attrs: Dict[str, Any], + for_profiler: bool, + kernel_config: Tuple[str, int, int] = ("cutlass::gemm::device::Gemm", 21, 3), +) -> str: + """ + update kAlignmentA, kAlignmentB, and epilogue_alignment in op_def, + which is a gemm instance emitted by the gemm instance emitter of cutlass. + kernel_config is a tuple used for finding kernel params. The first element + of kernel_config is the kernel kind, the second is the expected number + of params, and the third is the index offset of alignment values in the + full op_def string. 
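+
+    Note that alignments are only ever lowered here: each emitted value is the
+    minimum of the alignment already present in the instance and the maximum
+    alignment allowed by the corresponding input/output TensorAccessor.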
+ """ + if for_profiler: + return op_def + + input_accessors = func_attrs["input_accessors"] + a_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor( + input_accessors[0] + ) + b_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor( + input_accessors[1] + ) + output_accessor = func_attrs["output_accessors"][0] + epilogue_alignment = tensor_accessor_codegen.find_max_alignment_for_accessor( + output_accessor + ) + gemm_params = get_gemm_instance_template_params(op_def, kernel_config) + epilogue_align_idx = 11 + a_align_idx = 17 + b_align_idx = 18 + a_curr_align = gemm_params[a_align_idx].strip() + b_curr_align = gemm_params[b_align_idx].strip() + epilogue_curr_align = gemm_params[epilogue_align_idx].strip() + a_alignment = min(a_alignment, int(a_curr_align)) + b_alignment = min(b_alignment, int(b_curr_align)) + epilogue_alignment = min(epilogue_alignment, int(epilogue_curr_align)) + instance_lines = op_def.split("\n") + # a_align_idx + idx_offset in the full instance string + idx_offset = kernel_config[2] + + def _replace_align(align_idx, curr_align, alignment): + curr_align_line = instance_lines[align_idx + idx_offset] + assert curr_align == curr_align_line.strip( + " ," + ), f"expected {curr_align=} equal to {curr_align_line=}" + instance_lines[align_idx + idx_offset] = curr_align_line.replace( + curr_align, str(alignment) + ) + + _replace_align(a_align_idx, a_curr_align, a_alignment) + _replace_align(b_align_idx, b_curr_align, b_alignment) + _replace_align(epilogue_align_idx, epilogue_curr_align, epilogue_alignment) + return "\n".join(instance_lines) + + +def universal_gemm_instance( + op_def: str, func_attrs: Dict[str, Any], for_profiler: bool +) -> str: + op_def = update_alignments_in_gemm_instance(op_def, func_attrs, for_profiler) + tmp = op_def.replace( + "cutlass::gemm::device::Gemm", "cutlass::gemm::device::GemmUniversal" + ) + tmp = tmp.replace("false,", "") + return tmp + + +def kernel_name(op): + """Returns kernel_name of a given cutlass op_instance.""" + from cutlass_lib import library + + threadblock = op.tile_description.procedural_name() + extended_name = op.extended_name() + opcode_class_name = library.OpcodeClassNames[ + op.tile_description.math_instruction.opcode_class + ] + layout = op.layout_name() + align_ab = op.A.alignment + align_c = op.C.alignment + name = KERNEL_KEY_TEMPLATE.render( + threadblock=threadblock, + extended_name=extended_name, + opcode_class_name=opcode_class_name, + layout=layout, + align_ab=align_ab, + align_c=align_c, + ) + return name.replace("\n", "") + + +def emit_instance( + op, + for_profiler, + f_instance_convertor=universal_gemm_instance, + emit_kernel=False, + func_attrs=None, +): + import cutlass_lib + + emitter = cutlass_lib.gemm_operation.EmitGemmInstance() + if emit_kernel: + emitter = cutlass_lib.gemm_operation.EmitGemmUniversalInstance() + op_def = emitter.emit(op) + op_def = f_instance_convertor(op_def, func_attrs, for_profiler) + return op_def + + +def extract_config(f_proc_op): + import cutlass_lib + + op_kind = cutlass_lib.library.OperationKind.Gemm + gemm_kind = cutlass_lib.library.GemmKind.Universal + gemm_ops = OrderedDict() + extract_ops = list(Target.current()._operators[op_kind].items()) + + for _, value in extract_ops: + op = value[0] + if op.gemm_kind == gemm_kind: + ret = f_proc_op(op) + if len(ret) > 0: + for op_inst in ret: + key = kernel_name(op_inst) + gemm_ops[key] = op_inst + return gemm_ops + + +def extract_config_name(config): + pattern = re.compile(r"\s*using\s(.*?)\s=") + decl = 
config.split("\n")[2] + match = pattern.match(decl) + if match is None: + raise RuntimeError("Invalid config: \n" + config) + return match.groups()[0] + + +def gen_function( + func_attrs, + src_template, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + f_instance_convertor=universal_gemm_instance, + emit_kernel=False, + support_split_k=False, + input_addr_calculator="", + output_addr_calculator="", + extra_code="", +): + func_name = func_attrs["name"] + exec_path = func_attrs["exec_path"] + op_instance = func_attrs["op_instance"] + inst_def_flag = set() + instances = {} + instance_decl = "" + for exec_item in exec_path.values(): + fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest() + algo = exec_item.algo + if algo not in inst_def_flag: + config = emit_instance( + op_instance[algo], + for_profiler=False, + f_instance_convertor=f_instance_convertor, + emit_kernel=emit_kernel, + func_attrs=func_attrs, + ) + inst_def_flag.add(algo) + else: + config = "" + inst = INSTANCE_TEMPLATE.render( + config=config, name=fname, config_name=extract_config_name(config) + ) + instances[exec_item.exec_cond] = inst + instance_decl += inst + shape_eval_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + exec_paths = "" + for key in instances: + fname = "f" + sha1(key.encode()).hexdigest() + program = EXEC_TEMPLATE.render( + indent=" ", + instance=fname, + problem_args=problem_args, + support_split_k=support_split_k, + ) + exec_inst = exec_cond_template.render(indent=" ", cond=key, program=program) + exec_paths += exec_inst + input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + ) + return src_template.render( + instances=instance_decl, + function_name=func_name, + dtype="cutlass::half_t", + shape_eval=shape_eval_func, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=output_addr_calculator, + input_output_checks=input_output_checks, + exec_paths=exec_paths, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + support_split_k=support_split_k, + has_d=has_d(func_attrs), + has_d1=has_d1(func_attrs), + extra_code=extra_code, + ) + + +def build_profiler(file_pairs): + target = Target.current() + if target.disable_profiler_codegen(): + file_pairs = [] + elif target.use_dummy_profiling_results(): + # if it is circle CI only random build 2 profiler + random.shuffle(file_pairs) + file_pairs = file_pairs[:2] + compile_engine = builder.Builder() + compile_engine.build_objs(file_pairs, target.compile_cmd(executable=True)) + + +def add_profiler(file_pairs, workdir, op_type, output_name, code): + prefix = os.path.join(workdir, "profiler", op_type) + if not os.path.exists(prefix): + os.makedirs(prefix) + src_path = os.path.join(prefix, output_name + ".cu") + obj_path = os.path.join(prefix, output_name) + if os.path.exists(obj_path): + return + with open(src_path, "w") as f: + f.write(code) + file_pairs.append((src_path, obj_path)) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + args_parser_template, + support_split_k=False, + output_addr_calculator="", + bias_ptr_arg=None, + extra_code="", +): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + ndims = 2 + adims = ["&a_dim" + str(i) for i in range(ndims)] + bdims = ["&b_dim" + str(i) for i in range(ndims)] + cdims = 
["&c_dim" + str(i) for i in range(ndims)] + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + for op_name, op in op_instance.items(): + config = emit_instance(op, for_profiler=True) + config_name = extract_config_name(config) + name = "GemmInstance" + instance = INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + support_split_k=support_split_k, + problem_args=problem_args_template.render(), + ) + input_output_checks = INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=ndims, + weight_ndims=ndims, + output_ndims=ndims, + ) + op_func = src_template.render( + instances=instance, + function_name="gemm", + input_ndims=ndims, + weight_ndims=ndims, + output_ndims=ndims, + shape_eval=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_program, + output_addr_calculator=output_addr_calculator, + support_split_k=support_split_k, + extra_code=extra_code, + ) + func_call = FUNC_CALL_TEMPLATE.render( + func_name="gemm", + a_ptr="memory_pool->RequestHalfTensorByIdx(0)", + b_ptr="memory_pool->RequestHalfTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestHalfTensorByIdx(2)", + split_k="split_k", + adims=adims, + bdims=bdims, + cdims=cdims, + ) + # TODO: Render args_parse by caller. + args_parse = ( + args_parser_template + if isinstance(args_parser_template, str) + else args_parser_template.render() + ) + code = PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parse, + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias), + ) + add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + build_profiler(file_pairs) + + +def gen_local_dim_defs(func_attrs, indent=" "): + """ + used together with input TensorAccessor to access a strided input + """ + if "input_accessors" not in func_attrs: + return "" + + dims = {} + for input_idx, input_accessor in enumerate(func_attrs["input_accessors"]): + if not input_accessor.is_from_strided_tensor: + continue + original_shape = input_accessor.original_shapes + for idx, dim in enumerate(original_shape): + # skip dynamic dims + if isinstance(dim, IntImm): + input_shape = func_attrs["inputs"][input_idx]._attrs["shape"] + name = input_shape[idx]._attrs["name"] + if name in dims: + assert dims[name] == dim.value(), "bmm inputs shape mismatch" + else: + dims[name] = dim.value() + return DIM_DEFS_TEMPLATE.render(dims=dims, indent=indent) + + +def gen_function_call(func_attrs, indent=" ", bias_ptr_arg=None): + a = func_attrs["inputs"][0] + ashapes = func_attrs["input_accessors"][0].original_shapes + b = func_attrs["inputs"][1] + bshapes = func_attrs["input_accessors"][1].original_shapes + c = func_attrs["outputs"][0] + cshapes = func_attrs["output_accessors"][0].original_shapes + has_bias = bias_ptr_arg is not None + # overwrite the global defs if we have input TensorAccessor + local_dim_defs = gen_local_dim_defs(func_attrs, indent=indent) + adims = ["&" + dim._attrs["name"] for dim in ashapes] + bdims = ["&" + dim._attrs["name"] for dim in bshapes] + cdims = ["&" + dim._attrs["name"] for dim in cshapes] + return FUNC_CALL_TEMPLATE.render( + local_dim_defs=local_dim_defs, + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + 
bias_ptr=bias_ptr_arg, + c_ptr=c._attrs["name"], + split_k=func_attrs["split_k"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + ) + + +def default_fproc_f16(*, op, a_layout, b_layout, c_layout, epiligue_name): + import copy + + import cutlass_lib + + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.accumulator_type() == acc_type + and op.A.layout == a_layout + and op.B.layout == b_layout + ): + op = copy.deepcopy(op) + # set output major + op.C.layout = c_layout + # set epilogue + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name] + op.element_epilogue = acc_type + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + +def make_fproc_f16(func_attrs, layout): + """ + This function sets a callback for processing the epilogue of the kernel + associated with func_attrs. + """ + + def fproc_f16(op): + a_layout, b_layout, c_layout = layout.cutlass_lib_layouts() + return default_fproc_f16( + op=op, + a_layout=a_layout, + b_layout=b_layout, + c_layout=c_layout, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = extract_config(fproc_f16) + + +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + tmp = cfg.split("_") + align_c = int(tmp[-1]) + align_ab = int(tmp[-2]) + if align_c != func_attrs["epilogue_alignment"]: + return False + if align_ab != ab_alignment: + return False + return True diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py new file mode 100644 index 000000000..98d8e979c --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias.py @@ -0,0 +1,134 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common codegen functions for gemm with bias. 
+""" + +import jinja2 + +# pylint: disable=C0301,C0415,R1705 + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = {{config_name}}; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/epilogue/thread/linear_combination_silu.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/device_memory.h" + +{{extra_code}} + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* bias_ptr, + cutlass::half_t* c_ptr, + uint8_t* workspace, +{% if support_split_k %} + int split_k, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t* a_dim{{idx}}, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t* b_dim{{idx}}, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t* c_dim{{idx}}, +{% endfor %} + cudaStream_t stream + ) { + {{shape_eval}} + {{input_addr_calculator}} + {{output_addr_calculator}} + {{extra_shape}} + {{input_output_checks}} + + if (!bias_ptr) { + throw std::runtime_error("bias_ptr is null!"); + } + + {{exec_paths}} + {% for idx in range(input_ndims) %} + std::cout << "input_ndims{{idx}}: " << *a_dim{{idx}} << std::endl; + {% endfor %} + {% for idx in range(weight_ndims) %} + std::cout << "weight_ndims{{idx}}: " << *b_dim{{idx}} << std::endl; + {% endfor %} + {% for idx in range(input_ndims) %} + std::cout << "output_ndims{{idx}}: " << *c_dim{{idx}} << std::endl; + {% endfor %} + throw std::runtime_error( + "Unsupported workload for this {{function_name}} specialization." + ); +} +""", + trim_blocks=True, + lstrip_blocks=True, +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py new file mode 100644 index 000000000..843230243 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_activation.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Common codegen functions for gemm_bias_activation. +""" + +from . import common, common_bias, gemm_rcr +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def gemm_rcr_config(func_attrs, dtype="float16"): + common.make_fproc_f16(func_attrs, RCR) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + problem_args_template, + extra_code="", +): + gemm_rcr.common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + problem_args_template, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + extra_code=extra_code, + ) + + +def gen_function( + func_attrs, + problem_args_template, + exec_cond_template, + dim_info_dict, + extra_code="", +): + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + problem_args = problem_args_template.render() + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", + output_accessor=func_attrs["output_accessors"][0], + ), + extra_code=extra_code, + ) + + +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py new file mode 100644 index 000000000..5c46b3cc5 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/common_bias_broadcast.py @@ -0,0 +1,585 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = UnaryOp2(BinaryOp2(BinaryOp1(UnaryOp1(GeMM(A, B) + bias), D1), D2)), +""" + +import re +from functools import partial + +import jinja2 + +from ...common import gemm_common +from ...target import Target + +from . import common, gemm_rcr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +# For config extraction. 
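+# The jinja template below rewrites a plain cutlass::gemm::device::Gemm instance
+# (as emitted by the CUTLASS generator) into a GemmUniversalWithBroadcast
+# instance whose epilogue fuses the UnaryOp1 / BinaryOp1 / BinaryOp2 / UnaryOp2
+# chain; BinaryOp2 is only emitted when a second residual input (d1) is present.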
+GEMM_UNIVERSAL_WITH_BROADCAST_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::device::GemmUniversalWithBroadcast< + cutlass::half_t, {{layout.cutlass_layout_a}}, + cutlass::half_t, {{layout.cutlass_layout_b}}, + cutlass::half_t, {{layout.cutlass_layout_c}}, + {{acc_type}}, + cutlass::arch::OpClassTensorOp, + {{arch}}, + {{tb_shape}}, + {{warp_shape}}, + {{instruction_shape}}, + {{epilogue_functor}}< + cutlass::half_t, {{acc_type}}, {{acc_type}}, + cutlass::half_t, {{epilogue_vector_length}}, + {{unary_op1}}, {{binary_op1}}, {{unary_op2}} +{% if has_d1 %} + , {{binary_op2}} +{% endif %} + >, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + {{stage}}, + {{alignment_a}}, + {{alignment_b}} + >; +""" +) + +# For func codegen. +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + { {{layout.m}}, {{layout.n}}, {{layout.k}} }, +{% if support_split_k %} + split_k, +{% else %} + 1, +{% endif %} + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) (a_ptr + input_a_offset), + (void*) (b_ptr + input_b_offset), + (void*) d0_ptr, +{% if has_d1 %} + (void*) d1_ptr, +{% else %} + nullptr, +{% endif %} + (void*) (c_ptr + output_offset), + (void*) bias_ptr, + nullptr, + /*batch_stride_A*/ input_a_batch_stride, + /*batch_stride_B*/ input_b_batch_stride, + /*batch_stride_C1*/ 0, + /*batch_stride_C2*/ 0, + /*batch_stride_D*/ 0, + /*batch_stride_Vector*/ 0, + /*batch_stride_Tensor*/ 0, + input_a_stride, + input_b_stride, + {{layout.stride_c}}, +{% if has_d1 %} + {{layout.stride_c}}, +{% else %} + 0, +{% endif %} + output_stride, + /*ldr*/ 0, + /*/ldt*/ 0 +""" +) + +# for profiler, no need to include TensorAccessor +PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + { {{layout.m}}, {{layout.n}}, {{layout.k}} }, +{% if support_split_k %} + split_k, +{% else %} + 1, +{% endif %} + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) d0_ptr, +{% if has_d1 %} + (void*) d1_ptr, +{% else %} + nullptr, +{% endif %} + (void*) (c_ptr + output_offset), + (void*) bias_ptr, + nullptr, + /*batch_stride_A*/ 0, + /*batch_stride_B*/ 0, + /*batch_stride_C1*/ 0, + /*batch_stride_C2*/ 0, + /*batch_stride_D*/ 0, + /*batch_stride_Vector*/ 0, + /*batch_stride_Tensor*/ 0, + {{layout.stride_a}}, + {{layout.stride_b}}, + {{layout.stride_c}}, +{% if has_d1 %} + {{layout.stride_c}}, +{% else %} + 0, +{% endif %} + output_stride, + /*ldr*/ 0, + /*/ldt*/ 0 +""" +) + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination_residual_block_v2.h" +#include "cutlass/gemm/device/gemm_universal_with_broadcast.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" +#include "cutlass/util/device_memory.h" + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instances}} + +void {{function_name}} ( + cutlass::half_t* a_ptr, + cutlass::half_t* b_ptr, + cutlass::half_t* bias_ptr, + cutlass::half_t* d0_ptr, +{% if has_d1 %} + cutlass::half_t* d1_ptr, +{% endif %} + 
cutlass::half_t* c_ptr, + uint8_t* workspace, +{% if support_split_k %} + int split_k, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t* a_dim{{idx}}, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t* b_dim{{idx}}, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t* c_dim{{idx}}, +{% endfor %} + cudaStream_t stream + ) { + {{shape_eval}} + {{input_addr_calculator}} + {{output_addr_calculator}} + {{extra_shape}} + {{input_output_checks}} + + if (!bias_ptr) { + throw std::runtime_error("bias is null!"); + } + if (!d0_ptr) { + throw std::runtime_error("d0_ptr is null!"); + } +{% if has_d1 %} + if (!d1_ptr) { + throw std::runtime_error("d1_ptr is null!"); + } +{% endif %} + + {{exec_paths}} + throw std::runtime_error( + "Unsupported workload for this {{function_name}} specialization." + ); +} +""", + trim_blocks=True, + lstrip_blocks=True, +) + +# For function declaration codegen. +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +void {{func_name}}( + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, + cutlass::half_t*, +{% if has_d1 %} + cutlass::half_t*, +{% endif %} + cutlass::half_t*, + uint8_t*, +{% if support_split_k %} + int, +{% endif %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(weight_ndims) %} + int64_t*, +{% endfor %} +{% for idx in range(input_ndims) %} + int64_t*, +{% endfor %} + cudaStream_t +); +""" +) + + +# For function call codegen. +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{ +{{indent}}{{local_dim_defs}} +{{indent}}{{func_name}}( +{{indent}} {{a_ptr}}, +{{indent}} {{b_ptr}}, +{{indent}} {{bias_ptr}}, +{{indent}} {{d0_ptr}}, +{% if has_d1 %} +{{indent}} {{d1_ptr}}, +{% endif %} +{{indent}} {{c_ptr}}, +{{indent}} global_workspace, +{% if support_split_k %} +{{indent}} {{split_k}}, +{% endif %} +{% for dim in adims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in bdims %} +{{indent}} {{dim}}, +{% endfor %} +{% for dim in cdims %} +{{indent}} {{dim}}, +{% endfor %} +{{indent}} stream +{{indent}}); +{{indent}}} +""" +) + +# For profiler codegen. 
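+# The standalone profiler binary receives the GEMM problem size on the command
+# line, e.g. "./<profiler> M N K [split_k]" (illustrative invocation); the
+# template below turns argv into M/N/K (and split_k) before handing control to
+# the layout-specific argument parser.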
+ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); +{% if support_split_k %} + int split_k = std::atoi(argv[4]); +{% endif %} + {{layout.args_parser}} +""" +) + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + int64_t a_ptr_sz = a_dim0 * a_dim1; + int64_t b_ptr_sz = b_dim0 * b_dim1; + int64_t c_ptr_sz = c_dim0 * c_dim1; + // The value 1 is used to force ptr_max_sz to be non-zero + int64_t ptr_max_sz = std::max({1, a_ptr_sz, b_ptr_sz, c_ptr_sz}); + // TODO: special pool size for A100 L2 cache 40M + // need to tune it for other devices + int64_t mem_pool_sz = std::max(2, std::min(64, int((1 << 25) / ptr_max_sz))); + + memory_pool->AllocateHalfTensor(a_ptr_sz, mem_pool_sz); // a_ptr: index 0 + memory_pool->AllocateHalfTensor(b_ptr_sz, mem_pool_sz); // b_ptr: index 1 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // c_ptr: index 2 + memory_pool->AllocateHalfTensor(c_dim1, mem_pool_sz); // bias_ptr: index 3 + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // d0 ptr: index 4 +{% if has_d1 %} + memory_pool->AllocateHalfTensor(c_ptr_sz, mem_pool_sz); // d1 ptr: index 5 +{% endif %} +""" +) + + +def _support_split_k(func_attrs): + return func_attrs["split_k"] is not None + + +def gemm_bias_broadcast_instance( + op_def, + func_attrs, + for_profiler, + layout, + unary_op1, + binary_op1, + binary_op2, + unary_op2, +): + """ + adjust gemm instance with respect to input_accessors, layout and epilogue ops + """ + op_def = common.update_alignments_in_gemm_instance(op_def, func_attrs, for_profiler) + gemm_universal_params = common.get_gemm_instance_template_params(op_def) + epilogue_pattern = re.compile(r"\s*(cutlass::epilogue::thread::.*)\s*<") + match = epilogue_pattern.match(gemm_universal_params[9]) + if match is None: + raise RuntimeError("Invalid epilogue functor:\n" + gemm_universal_params[9]) + epilogue_functor = match.groups()[0] + + if ( + "use_fp16_acc" in Target.current()._kwargs + and Target.current()._kwargs["use_fp16_acc"] + ): + acc_type = "cutlass::half_t" + else: + acc_type = "float" + gemm_universal_with_broadcast_params = ( + GEMM_UNIVERSAL_WITH_BROADCAST_TEMPLATE.render( + arch=gemm_universal_params[5], + tb_shape=gemm_universal_params[6], + warp_shape=gemm_universal_params[7], + instruction_shape=gemm_universal_params[8], + epilogue_functor=epilogue_functor, + epilogue_vector_length=gemm_universal_params[11], + unary_op1=unary_op1, + binary_op1=binary_op1, + binary_op2=binary_op2, + unary_op2=unary_op2, + stage=gemm_universal_params[16], + alignment_a=gemm_universal_params[17], + alignment_b=gemm_universal_params[18], + layout=layout, + acc_type=acc_type, + has_d1=(binary_op2 is not None), + ) + ) + res = re.sub( + r"cutlass::gemm::device::Gemm<[\s\S]+>;", + gemm_universal_with_broadcast_params, + op_def, + ) + return res + + +def gemm_bias_broadcast_config(func_attrs, layout, dtype="float16"): + common.make_fproc_f16(func_attrs, layout) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + layout, + unary_op1, + binary_op1, + binary_op2, + unary_op2, +): + op_type = func_attrs["op"] + support_split_k = _support_split_k(func_attrs) + op_instance = func_attrs["op_instance"] + has_d1 = common.has_d1(func_attrs) + + ndims = 2 + adims = ["&a_dim" + str(i) for i in range(ndims)] + bdims = ["&b_dim" + str(i) for i in range(ndims)] + cdims = ["&c_dim" + str(i) for i in range(ndims)] + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", 
dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + for op_name, op in op_instance.items(): + config = common.emit_instance( + op, + for_profiler=True, + f_instance_convertor=partial( + gemm_bias_broadcast_instance, + layout=layout, + unary_op1=unary_op1, + binary_op1=binary_op1, + binary_op2=binary_op2, + unary_op2=unary_op2, + ), + ) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + exec_program = common.EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + problem_args=PROFILER_PROBLEM_ARGS_TEMPLATE.render( + support_split_k=support_split_k, layout=layout, has_d1=has_d1 + ), + ) + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=ndims, + weight_ndims=ndims, + output_ndims=ndims, + ) + op_func = SRC_TEMPLATE.render( + instances=instance, + function_name="gemm", + input_ndims=ndims, + weight_ndims=ndims, + shape_eval=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_program, + output_addr_calculator=common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N" + ), + support_split_k=support_split_k, + has_d1=has_d1, + ) + func_call = FUNC_CALL_TEMPLATE.render( + func_name="gemm", + a_ptr="memory_pool->RequestHalfTensorByIdx(0)", + b_ptr="memory_pool->RequestHalfTensorByIdx(1)", + c_ptr="memory_pool->RequestHalfTensorByIdx(2)", + d0_ptr="memory_pool->RequestHalfTensorByIdx(4)", + d1_ptr="memory_pool->RequestHalfTensorByIdx(5)", + bias_ptr="memory_pool->RequestHalfTensorByIdx(3)", + adims=adims, + bdims=bdims, + cdims=cdims, + support_split_k=support_split_k, + split_k="split_k", + has_d1=has_d1, + ) + code = common.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=ARGS_PARSER_TEMPLATE.render( + layout=layout, support_split_k=support_split_k + ), + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_d1=has_d1), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + layout, + unary_op1, + binary_op1, + binary_op2, + unary_op2, +): + input_addr_calculator = gemm_rcr.get_input_addr_calculator(func_attrs) + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + support_split_k = _support_split_k(func_attrs) + has_d1 = common.has_d1(func_attrs) + problem_args = PROBLEM_ARGS_TEMPLATE.render( + layout=layout, support_split_k=support_split_k, has_d1=has_d1 + ) + return common.gen_function( + func_attrs, + SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + f_instance_convertor=partial( + gemm_bias_broadcast_instance, + layout=layout, + unary_op1=unary_op1, + binary_op1=binary_op1, + binary_op2=binary_op2, + unary_op2=unary_op2, + ), + support_split_k=support_split_k, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", + output_accessor=func_attrs["output_accessors"][0], + ), + ) + + +def gen_function_decl(func_attrs): + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return FUNC_DECL_TEMPLATE.render( + 
func_name=func_attrs["name"], + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=_support_split_k(func_attrs), + has_d1=common.has_d1(func_attrs), + ) + + +def gen_function_call(func_attrs, indent=" "): + has_d1 = common.has_d1(func_attrs) + if has_d1: + (a, b, bias, d0, d1) = func_attrs["inputs"] + else: + (a, b, bias, d0) = func_attrs["inputs"] + d1 = None + c = func_attrs["outputs"][0] + # overwrite the global defs if we have input TensorAccessor + local_dim_defs = common.gen_local_dim_defs(func_attrs, indent=indent) + adims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][0].original_shapes + ] + bdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][1].original_shapes + ] + cdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["output_accessors"][0].original_shapes + ] + return FUNC_CALL_TEMPLATE.render( + local_dim_defs=local_dim_defs, + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + bias_ptr=bias._attrs["name"], + d0_ptr=d0._attrs["name"], + d1_ptr=d1._attrs["name"] if has_d1 else "", + c_ptr=c._attrs["name"], + split_k=func_attrs["split_k"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + support_split_k=_support_split_k(func_attrs), + has_d1=has_d1, + ) diff --git a/python/aitemplate/backend/cuda/gemm_universal/common_permute.py b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py new file mode 100644 index 000000000..2f3f1e903 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/common_permute.py @@ -0,0 +1,351 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common codegen functions for gemm + permute. +""" + +import re +from collections import OrderedDict +from hashlib import sha1 + +import jinja2 + +from ...common import gemm_common +from ...target import Target +from ..gemm_universal import common + +# pylint: disable=C0301,C0415,R1705 + +EXTRA_CODE = jinja2.Template( + """ +#include "cutlass/layout/permute.h" +""" +) + +# HACK: we don't record different permutation shape, +# because it has little impact on execution time compared. +# Therefore, no matter what permutation shape it is, +# we will use the same kernel, i.e. the first generated perm_shape +# At runtime, the kernel will be regenerated and thus the correctness will not be affected. 
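To make the caching scheme above concrete, here is a small, self-contained sketch (not from the patch; every value is made up for illustration) of how a key in the style of the KERNEL_KEY_TEMPLATE defined next is assembled, and why kernel_name() flattens the rendered result with name.replace("\n", "").

import jinja2

# Single-line stand-in for KERNEL_KEY_TEMPLATE; the real template string carries
# leading/trailing newlines, which is why kernel_name() strips them after rendering.
key_tpl = jinja2.Template(
    "cutlass_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}"
    "_{{perm_type}}_{{perm_shape}}_align_{{align_ab}}_{{align_c}}"
)

key = key_tpl.render(
    opcode_class_name="tensorop",        # hypothetical values, not taken from cutlass_lib
    extended_name="f16_s16816gemm_f16",
    threadblock="128x128_32x4",
    layout="tn",
    perm_type="perm4d",
    perm_shape="8",                      # only the first generated shape is recorded
    align_ab=8,
    align_c=8,
)
print(key)  # cutlass_tensorop_f16_s16816gemm_f16_128x128_32x4_tn_perm4d_8_align_8_8

Because the permutation shape has little effect on execution time, every shape reuses the kernel tuned for the first recorded key; the perm_shape component only keeps the key unambiguous when the kernel is regenerated at runtime.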
+KERNEL_KEY_TEMPLATE = jinja2.Template( + """ +cutlass_{{opcode_class_name}}_{{extended_name}}_{{threadblock}}_{{layout}}_{{perm_type}}_{{perm_shape}}_align_{{align_ab}}_{{align_c}} +""" +) + + +def kernel_name(op, func_attrs): + """Returns kernel_name given input cutlass op_instance and operator attrs.""" + + from cutlass_lib import library + + threadblock = op.tile_description.procedural_name() + extended_name = op.extended_name() + opcode_class_name = library.OpcodeClassNames[ + op.tile_description.math_instruction.opcode_class + ] + layout = op.layout_name() + align_ab = op.A.alignment + align_c = op.C.alignment + shape = func_attrs["shape"] + if len(shape) == 1: + perm_type = "perm4d" + perm_shape = f"{shape[0]}" + elif len(shape) == 3: + perm_type = "perm5d" + perm_shape = f"{shape[0]}_{shape[1]}_{shape[2]}" + else: + raise NotImplementedError( + f"gemm permute shape with {shape} is not implemented!" + ) + name = KERNEL_KEY_TEMPLATE.render( + threadblock=threadblock, + extended_name=extended_name, + opcode_class_name=opcode_class_name, + layout=layout, + align_ab=align_ab, + align_c=align_c, + perm_type=perm_type, + perm_shape=perm_shape, + ) + return name.replace("\n", "") + + +def default_fproc_f16( + *, op, a_layout, b_layout, c_layout, epiligue_name, permute_layout +): + """Generates new op_instances by adding alignment info, permute_layout, etc.""" + import copy + + import cutlass_lib + + ret = [] + data_type = cutlass_lib.library.DataType.f16 + acc_type = cutlass_lib.library.DataType.f32 + # check target use fp16 acc + if "use_fp16_acc" in Target.current()._kwargs: + if Target.current()._kwargs["use_fp16_acc"]: + acc_type = cutlass_lib.library.DataType.f16 + if ( + op.A.element == data_type + and op.B.element == data_type + and op.C.element == data_type + and op.accumulator_type() == acc_type + and op.A.layout == a_layout + and op.B.layout == b_layout + ): + op = copy.deepcopy(op) + # set output major + op.C.layout = c_layout + # set epilogue + op.epilogue_functor = cutlass_lib.library.EpilogueFunctorName[epiligue_name] + op.element_epilogue = acc_type + op.permute_layout = cutlass_lib.library.EpiloguePermuteLayoutName[ + permute_layout + ] + # set C alignment + for i in [8, 4, 2, 1]: + op = copy.deepcopy(op) + op.C.alignment = i + ret.append(op) + return ret + + +def extract_config(f_proc_op, func_attrs): + import cutlass_lib + + op_kind = cutlass_lib.library.OperationKind.Gemm + gemm_kind = cutlass_lib.library.GemmKind.Universal + gemm_ops = OrderedDict() + extract_ops = list(Target.current()._operators[op_kind].items()) + + for _, value in extract_ops: + op = value[0] + if op.gemm_kind == gemm_kind: + ret = f_proc_op(op) + if len(ret) > 0: + for op_inst in ret: + key = kernel_name(op_inst, func_attrs) + gemm_ops[key] = op_inst + return gemm_ops + + +def gemm_permute_instance(op_def, func_attrs, for_profiler): + import cutlass_lib + + op_def = common.update_alignments_in_gemm_instance( + op_def, + func_attrs, + for_profiler, + # expected to have 26 of params, the index offset of alignment value + # in the full op_def string is 4 + kernel_config=("cutlass::gemm::device::GemmUniversal", 26, 4), + ) + shape_info = ", ".join(map(str, func_attrs["shape"])) + layout = cutlass_lib.library.EpiloguePermuteLayoutName[func_attrs["layout"]] + layout_class = cutlass_lib.library.EpiloguePermuteLayoutTag[layout] + tmp = re.sub( + r"{}".format(layout_class), "{}<{}>".format(layout_class, shape_info), op_def + ) + return tmp + + +def emit_instance( + op, + for_profiler, + 
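+    # f_instance_convertor post-processes the emitted op definition
+    # (gemm_permute_instance by default); emit_kernel switches to the
+    # permute-aware EmitGemmPermuteInstance emitter selected in the body below.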
f_instance_convertor=gemm_permute_instance, + emit_kernel=False, + func_attrs=None, +): + import cutlass_lib + + emiter = cutlass_lib.gemm_operation.EmitGemmInstance() + if emit_kernel: + emiter = cutlass_lib.gemm_operation.EmitGemmPermuteInstance() + + op_def = emiter.emit(op) + op_def = f_instance_convertor(op_def, func_attrs, for_profiler) + return op_def + + +def gen_function( + func_attrs, + src_template, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + f_instance_convertor=gemm_permute_instance, + emit_kernel=False, + support_split_k=False, + input_addr_calculator="", + output_addr_calculator="", + extra_code="", +): + func_name = func_attrs["name"] + exec_path = func_attrs["exec_path"] + op_instance = func_attrs["op_instance"] + inst_def_flag = set() + instances = {} + instance_decl = "" + for exec_item in exec_path.values(): + fname = "f" + sha1(exec_item.exec_cond.encode()).hexdigest() + algo = exec_item.algo + if algo not in inst_def_flag: + config = emit_instance( + op_instance[algo], + for_profiler=False, + f_instance_convertor=f_instance_convertor, + emit_kernel=emit_kernel, + func_attrs=func_attrs, + ) + inst_def_flag.add(algo) + else: + config = "" + inst = common.INSTANCE_TEMPLATE.render( + config=config, name=fname, config_name=common.extract_config_name(config) + ) + instances[exec_item.exec_cond] = inst + instance_decl += inst + shape_eval_func = gemm_common.gen_shape_eval_code( + indent=1, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + exec_paths = "" + for key, _ in instances.items(): + fname = "f" + sha1(key.encode()).hexdigest() + program = common.EXEC_TEMPLATE.render( + indent=" ", + instance=fname, + problem_args=problem_args, + support_split_k=support_split_k, + ) + exec_inst = exec_cond_template.render(indent=" ", cond=key, program=program) + exec_paths += exec_inst + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + ) + return src_template.render( + instances=instance_decl, + function_name=func_name, + dtype="cutlass::half_t", + shape_eval=shape_eval_func, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=output_addr_calculator, + input_output_checks=input_output_checks, + exec_paths=exec_paths, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + support_split_k=support_split_k, + has_d=common.has_d(func_attrs), + has_d1=common.has_d1(func_attrs), + extra_code=extra_code, + ) + + +def gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + args_parser_template, + emit_kernel=False, + support_split_k=False, + output_addr_calculator="", + bias_ptr_arg=None, + extra_code="", +): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + + ndims = 2 + adims = ["&a_dim" + str(i) for i in range(ndims)] + bdims = ["&b_dim" + str(i) for i in range(ndims)] + cdims = ["&c_dim" + str(i) for i in range(ndims)] + shape_func = gemm_common.gen_shape_eval_code( + indent=2, dtype="int64_t", dim_info_dict=dim_info_dict, is_ptr=True + ) + + file_pairs = [] + has_bias = bias_ptr_arg is not None + for op_name, op in op_instance.items(): + config = emit_instance( + op, for_profiler=True, emit_kernel=emit_kernel, func_attrs=func_attrs + ) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = common.INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, 
config=config + ) + exec_program = common.EXEC_TEMPLATE.render( + indent=" ", + instance=name, + is_profiler=True, + support_split_k=support_split_k, + problem_args=problem_args_template.render(), + ) + input_output_checks = common.INPUT_OUTPUT_CHECKS_TEMPLATE.render( + input_ndims=ndims, + weight_ndims=ndims, + output_ndims=ndims, + ) + op_func = src_template.render( + instances=instance, + function_name="gemm", + input_ndims=2, + weight_ndims=2, + output_ndims=2, + shape_eval=shape_func, + input_output_checks=input_output_checks, + exec_paths=exec_program, + output_addr_calculator=output_addr_calculator, + support_split_k=support_split_k, + extra_code=extra_code, + ) + func_call = common.FUNC_CALL_TEMPLATE.render( + func_name="gemm", + a_ptr="memory_pool->RequestHalfTensorByIdx(0)", + b_ptr="memory_pool->RequestHalfTensorByIdx(1)", + has_bias=has_bias, + bias_ptr=bias_ptr_arg, + c_ptr="memory_pool->RequestHalfTensorByIdx(2)", + split_k="split_k", + adims=adims, + bdims=bdims, + cdims=cdims, + ) + # TODO: Render args_parse by caller. + args_parse = ( + args_parser_template + if isinstance(args_parser_template, str) + else args_parser_template.render() + ) + code = common.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=args_parse, + func_call=func_call, + name=name, + tensor_decl=common.TENSOR_DECL_TEMPLATE.render( + name=name, has_bias=has_bias + ), + ) + common.add_profiler(file_pairs, workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py new file mode 100644 index 000000000..0fb211cb0 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr.py @@ -0,0 +1,229 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = GeMM(A, B) +where A[RowMajor][M, K], B[ColMajor][N, K] +""" +import jinja2 + +from ... import registry +from . 
import common +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); + int64_t split_k = std::atoi(argv[4]); + + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = N; + int64_t b_dim1 = K; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" +) + +# used for real execution +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + (void*) (a_ptr + input_a_offset), + (void*) (b_ptr + input_b_offset), + (void*) (c_ptr + output_offset), + (void*) (c_ptr + output_offset), + input_a_batch_stride, + input_b_batch_stride, + /*output_batch_stride*/ M * N, + /*output_batch_stride*/ M * N, + input_a_stride, + input_b_stride, + output_stride, + output_stride +""" +) + + +# for profiler, no need to include TensorAccessor +PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) c_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + M * N, + M * N, + K, + K, + N, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + common.make_fproc_f16(func_attrs, RCR) + + +def common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + bias_ptr_arg=None, + extra_code="", +): + output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="*b_dim0" + ) + common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + ARGS_PARSER_TEMPLATE, + support_split_k=True, + output_addr_calculator=output_addr_calculator, + bias_ptr_arg=bias_ptr_arg, + extra_code=extra_code, + ) + + +@registry.reg("cuda.gemm_rcr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + PROFILER_PROBLEM_ARGS_TEMPLATE, + ) + + +def get_input_addr_calculator(func_attrs): + input_a_batch_stride_dim = "M * K" + input_a_stride_k_dim = "K" + input_a_offset = 0 + input_b_batch_stride_dim = "N * K" + input_b_stride_k_dim = "K" + input_b_offset = 0 + + if "input_accessors" in func_attrs: + input_a_accessor = func_attrs["input_accessors"][0] + input_b_accessor = func_attrs["input_accessors"][1] + if input_a_accessor.is_from_strided_tensor: + input_a_offset = input_a_accessor.offset + shapes = input_a_accessor.original_shapes + input_a_stride_k_dim = input_a_accessor.stride(len(shapes) - 2) + + if input_b_accessor.is_from_strided_tensor: + input_b_offset = input_b_accessor.offset + shapes = input_b_accessor.original_shapes + input_b_stride_k_dim = input_b_accessor.stride(len(shapes) - 2) + + input_addr_calculator = common.INPUT_ADDR_CALCULATOR.render( + input_a_batch_stride_dim=input_a_batch_stride_dim, + input_a_stride_dim=input_a_stride_k_dim, + input_a_offset_val=input_a_offset, + input_b_batch_stride_dim=input_b_batch_stride_dim, + input_b_stride_dim=input_b_stride_k_dim, + input_b_offset_val=input_b_offset, + ) + return input_addr_calculator + + +@registry.reg("cuda.gemm_rcr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_addr_calculator = 
get_input_addr_calculator(func_attrs) + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + problem_args = PROBLEM_ARGS_TEMPLATE.render() + return common.gen_function( + func_attrs, + common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + support_split_k=True, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", output_accessor=func_attrs["output_accessors"][0] + ), + ) + + +@registry.reg("cuda.gemm_rcr.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rcr.func_call") +def gen_function_call(func_attrs, indent=" "): + return common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py new file mode 100644 index 000000000..f54c0ed2c --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = GeMM(A, B) + bias +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from . 
import common, common_bias, gemm_rcr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +# used for real execution +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) (a_ptr + input_a_offset), + (void*) (b_ptr + input_b_offset), + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + input_a_batch_stride, + input_b_batch_stride, + /*bias_batch_stride*/ N, + /*output_batch_stride*/ M * N, + input_a_stride, + input_b_stride, + /*bias_stride*/ 0, + output_stride +""" +) + + +# for profiler, no need to include TensorAccessor +PROFILER_PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return gemm_rcr.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + gemm_rcr.common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + PROFILER_PROBLEM_ARGS_TEMPLATE, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + ) + + +@registry.reg("cuda.gemm_rcr_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_addr_calculator = gemm_rcr.get_input_addr_calculator(func_attrs) + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + problem_args = PROBLEM_ARGS_TEMPLATE.render() + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + support_split_k=True, + input_addr_calculator=input_addr_calculator, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", output_accessor=func_attrs["output_accessors"][0] + ), + ) + + +@registry.reg("cuda.gemm_rcr_bias.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rcr_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.gemm_rcr_bias.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. 
+ """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py new file mode 100644 index 000000000..c2fc67191 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = ADD(GeMM(A, B) + bias, D0) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::plus" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::Identity" + + +@registry.reg("cuda.gemm_rcr_bias_add.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_add.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py new file mode 100644 index 000000000..56511dbc1 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = RELU(ADD(ADD(GeMM(A, B) + bias, D0), D1)) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N] +""" +from ... import registry +from . import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::plus" +BINARY_OP2 = "cutlass::plus" +UNARY_OP2 = "cutlass::epilogue::thread::Identity" + + +@registry.reg("cuda.gemm_rcr_bias_add_add.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_add_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_add.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_add_add.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_add_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py new file mode 100644 index 000000000..f823baab2 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_add_relu.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +GEMM Specialization for +C = RELU(ADD(ADD(GeMM(A, B) + bias, D0), D1)) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N] +""" +from ... import registry +from . import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::plus" +BINARY_OP2 = "cutlass::plus" +UNARY_OP2 = "cutlass::epilogue::thread::ReLu" + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_add_add_relu.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py new file mode 100644 index 000000000..bd4f7da4b --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_add_relu.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = RELU(ADD(GeMM(A, B) + bias, D0)) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::plus" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::ReLu" + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_add_relu.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py new file mode 100644 index 000000000..f55e21cd8 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_fast_gelu.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for C = fast_gelu(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][K], C[RowMajor][M, N] +""" +import jinja2 + +from ... import registry +from . 
import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +EXTRA_CODE = jinja2.Template( + """ +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/constants.h" +#include "cutlass/complex.h" +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/functional.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +namespace cutlass { +namespace epilogue { +namespace thread { + +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationFastGELU = LinearCombinationGeneric; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +""" +) + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + extra_code=EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + extra_code=EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_fast_gelu.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py new file mode 100644 index 000000000..d16d769a1 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_gelu.py @@ -0,0 +1,106 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for C = fast_gelu(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][K], C[RowMajor][M, N] +""" +import jinja2 + +from ... import registry +from . import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + ) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_gelu.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py new file mode 100644 index 000000000..6c22e1e3a --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_hardswish.py @@ -0,0 +1,106 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +""" +GEMM Specialization for C = hard_swish(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][K], C[RowMajor][M, N] +""" +import jinja2 + +from ... import registry +from . import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + ) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_hardswish.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py new file mode 100644 index 000000000..f2049abef --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = ADD(GeMM(A, B) + bias, D0) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::multiplies" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::Identity" + + +@registry.reg("cuda.gemm_rcr_bias_mul.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_mul.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_mul.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_mul.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py new file mode 100644 index 000000000..55400a029 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_add.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = Add(Mul(GeMM(A, B) + bias, D0), D1), +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N], D1[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::multiplies" +BINARY_OP2 = "cutlass::plus" +UNARY_OP2 = "cutlass::epilogue::thread::Identity" + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_mul_add.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py new file mode 100644 index 000000000..3d5abf306 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_mul_tanh.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = TANH(Mul((GeMM(A, B) + bias), D0)) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Identity" +BINARY_OP1 = "cutlass::multiplies" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::Tanh" + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_mul_tanh.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py new file mode 100644 index 000000000..2a4c75cbe --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_permute.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM with bias and permute epilogue fusion +""" + +from ... import registry +from ..gemm_universal import common +from . 
import common_bias, common_permute, gemm_rcr_bias, gemm_rcr_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +PROBLEM_ARGS_TEMPLATE = gemm_rcr_bias.PROFILER_PROBLEM_ARGS_TEMPLATE + + +@registry.reg("cuda.gemm_rcr_bias_permute.config") +def gemm_rcr_bias_permute_config(func_attrs, dtype="float16"): + return gemm_rcr_permute.gemm_rcr_permute_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return gemm_rcr_permute.common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + problem_args_template=None, +): + if problem_args_template is None: + problem_args = PROBLEM_ARGS_TEMPLATE.render() + else: + problem_args = problem_args_template.render() + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + return common_permute.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N", output_accessor=func_attrs["output_accessors"][0] + ), + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_permute.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rcr_bias_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.gemm_rcr_bias_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py new file mode 100644 index 000000000..3a5940e7a --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_relu.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+GEMM Specialization for C = relu(GeMM(A, B) + bias)
+where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N], C[RowMajor][M, N]
+"""
+
+import jinja2
+
+from ... import registry
+from . import common, common_bias_activation
+
+# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703
+
+
+PROBLEM_ARGS_TEMPLATE = jinja2.Template(
+    """
+    cutlass::gemm::GemmUniversalMode::kGemm,
+    {M, N, K},
+    split_k,
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+    (void*) a_ptr,
+    (void*) b_ptr,
+    (void*) bias_ptr,
+    (void*) (c_ptr + output_offset),
+    M * K,
+    N * K,
+    N,
+    M * N,
+    K,
+    K,
+    0,
+    output_stride
+"""
+)
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.config")
+def gemm_rcr_config(func_attrs, dtype="float16"):
+    return common_bias_activation.gemm_rcr_config(func_attrs, dtype)
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.gen_profiler")
+def gen_profiler(func_attrs, workdir, dim_info_dict):
+    return common_bias_activation.gen_profiler(
+        func_attrs,
+        workdir,
+        dim_info_dict,
+        PROBLEM_ARGS_TEMPLATE,
+    )
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.gen_function")
+def gen_function(
+    func_attrs,
+    exec_cond_template,
+    dim_info_dict,
+):
+    return common_bias_activation.gen_function(
+        func_attrs,
+        PROBLEM_ARGS_TEMPLATE,
+        exec_cond_template,
+        dim_info_dict,
+    )
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.func_decl")
+def gen_function_decl(func_attrs):
+    return common_bias_activation.gen_function_decl(func_attrs)
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.func_call")
+def gen_function_call(func_attrs, indent=" "):
+    return common_bias_activation.gen_function_call(func_attrs, indent)
+
+
+@registry.reg("cuda.gemm_rcr_bias_relu.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
new file mode 100644
index 000000000..719efbfa2
--- /dev/null
+++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid.py
@@ -0,0 +1,107 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
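+#
+# Note: the PROBLEM_ARGS_TEMPLATE below passes two epilogue scalars
+# (alpha, beta) to the CUTLASS epilogue, whereas the ReLU specialization in
+# gemm_rcr_bias_relu.py passes a third scalar, presumably the ReLU threshold
+# expected by its epilogue functor. Illustrative comparison of the rendered
+# epilogue arguments:
+#
+#   sigmoid: {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}
+#   relu:    {ElementComputeEpilogue(1), ElementComputeEpilogue(1), ElementComputeEpilogue(0)}
+#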
+# +""" +GEMM Specialization for +C = Sigmoid(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from . import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py new file mode 100644 index 000000000..b3b306f38 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = Mul(Sigmoid(GeMM(A, B) + bias), D0) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Sigmoid" +BINARY_OP1 = "cutlass::multiplies" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::Identity" + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py new file mode 100644 index 000000000..66cad13c4 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_sigmoid_mul_tanh.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = TANH(Mul(Sigmoid(GeMM(A, B) + bias), D0)) +where A[RowMajor][M, K], B[ColMajor][N, K], C[RowMajor][M, N] +bias[RowMajor][N], D0[RowMajor][M, N] +""" +from ... import registry +from . 
import common, common_bias_broadcast +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +UNARY_OP1 = "cutlass::epilogue::thread::Sigmoid" +BINARY_OP1 = "cutlass::multiplies" +BINARY_OP2 = None +UNARY_OP2 = "cutlass::epilogue::thread::Tanh" + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_broadcast.gemm_bias_broadcast_config(func_attrs, RCR) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + common_bias_broadcast.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_broadcast.gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + RCR, + UNARY_OP1, + BINARY_OP1, + BINARY_OP2, + UNARY_OP2, + ) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.func_decl") +def gen_function_decl(func_attrs): + return common_bias_broadcast.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_broadcast.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_sigmoid_mul_tanh.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py new file mode 100644 index 000000000..688c9daf3 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_swish.py @@ -0,0 +1,107 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = swish(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from . 
import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_swish.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_swish.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.gemm_rcr_bias_swish.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + ) + + +@registry.reg("cuda.gemm_rcr_bias_swish.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_swish.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_swish.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py new file mode 100644 index 000000000..8a11c966f --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_bias_tanh.py @@ -0,0 +1,144 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = tanh(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from . 
import common, common_bias_activation + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +EXTRA_CODE = jinja2.Template( + """ +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/constants.h" +#include "cutlass/complex.h" +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/functional.h" +#include "cutlass/epilogue/thread/linear_combination_generic.h" + +namespace cutlass { +namespace epilogue { +namespace thread { + +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data to store + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +using LinearCombinationTanh = LinearCombinationGeneric; + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +""" +) + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) bias_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + N, + M * N, + K, + K, + 0, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return common_bias_activation.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_bias_activation.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + PROBLEM_ARGS_TEMPLATE, + extra_code=EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + return common_bias_activation.gen_function( + func_attrs, + PROBLEM_ARGS_TEMPLATE, + exec_cond_template, + dim_info_dict, + extra_code=EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.func_decl") +def gen_function_decl(func_attrs): + return common_bias_activation.gen_function_decl(func_attrs) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.func_call") +def gen_function_call(func_attrs, indent=" "): + return common_bias_activation.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rcr_bias_tanh.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py new file mode 100644 index 000000000..f2851db12 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rcr_permute.py @@ -0,0 +1,220 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = permute(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[ColMajor][N, K], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from ..gemm_universal import common +from . import common_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); + int64_t split_k = std::atoi(argv[4]); + + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = N; + int64_t b_dim1 = K; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" +) + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) c_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + M * N, + M * N, + K, + K, + N, + output_stride +""" +) + + +@registry.reg("cuda.gemm_rcr_permute.config") +def gemm_rcr_permute_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common_permute.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + permute_layout=func_attrs["layout"], + ) + + func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs) + + +def common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + bias_ptr_arg=None, + extra_code="", +): + output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="*b_dim0" + ) + common_permute.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + ARGS_PARSER_TEMPLATE, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=output_addr_calculator, + bias_ptr_arg=bias_ptr_arg, + extra_code=extra_code, + ) + + +@registry.reg("cuda.gemm_rcr_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + problem_args_template=None, +): + if problem_args_template is None: + problem_args = PROBLEM_ARGS_TEMPLATE.render() + else: + problem_args = problem_args_template.render() + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + return common_permute.gen_function( + func_attrs, + common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + 
stride_dim="N", output_accessor=func_attrs["output_accessors"][0] + ), + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rcr_permute.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rcr_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + b = func_attrs["inputs"][1] + + output = func_attrs["outputs"][0] + has_bias = False + adims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][0].original_shapes + ] + bdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][1].original_shapes + ] + cdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["output_accessors"][0].original_shapes + ] + return common.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + c_ptr=output._attrs["name"], + split_k=func_attrs["split_k"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + ) + + +@registry.reg("cuda.gemm_rcr_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py new file mode 100644 index 000000000..0a3d109d6 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = GeMM(A, B) +where A[RowMajor][M, K], B[RowMajor][K, N] +""" +import jinja2 + +from ... import registry +from . 
import common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); + int64_t split_k = std::atoi(argv[4]); + + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = K; + int64_t b_dim1 = N; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" +) + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) c_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + M * N, + M * N, + K, + N, + N, + output_stride, +""" +) + + +@registry.reg("cuda.gemm_rrr.config") +def gemm_rrr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.gemm_rrr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N" + ) + common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + ARGS_PARSER_TEMPLATE, + support_split_k=True, + output_addr_calculator=output_addr_calculator, + ) + + +@registry.reg("cuda.gemm_rrr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + problem_args = PROBLEM_ARGS_TEMPLATE.render() + return common.gen_function( + func_attrs, + common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="*b_dim1", output_accessor=func_attrs["output_accessors"][0] + ), + ) + + +@registry.reg("cuda.gemm_rrr.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rrr.func_call") +def gen_function_call(func_attrs, indent=" "): + return common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.gemm_rrr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. 
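+
+    Filtering is delegated to common.function_filter; for instance, a
+    profiler config whose alignment is incompatible with the given
+    ab_alignment would typically be dropped here.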
+ """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py new file mode 100644 index 000000000..8653efab1 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/gemm_rrr_permute.py @@ -0,0 +1,221 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GEMM Specialization for +C = permute(GeMM(A, B) + bias) +where A[RowMajor][M, K], B[RowMajor][K, N], bias[RowMajor][N] +""" +import jinja2 + +from ... import registry +from ..gemm_universal import common +from . import common_permute + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int64_t M = std::atoi(argv[1]); + int64_t N = std::atoi(argv[2]); + int64_t K = std::atoi(argv[3]); + int64_t split_k = std::atoi(argv[4]); + + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = K; + int64_t b_dim1 = N; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" +) + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K}, + split_k, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + (void*) a_ptr, + (void*) b_ptr, + (void*) c_ptr, + (void*) (c_ptr + output_offset), + M * K, + N * K, + M * N, + M * N, + K, + N, + N, + output_stride, +""" +) + + +@registry.reg("cuda.gemm_rrr_permute.config") +def gemm_rrr_permute_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common_permute.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + permute_layout=func_attrs["layout"], + ) + + func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs) + + +def common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + bias_ptr_arg=None, + extra_code="", +): + output_addr_calculator = common.DEFAULT_OUTPUT_ADDR_CALCULATOR.render( + stride_dim="N" + ) + common_permute.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + src_template, + problem_args_template, + ARGS_PARSER_TEMPLATE, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=output_addr_calculator, + bias_ptr_arg=bias_ptr_arg, + extra_code=extra_code, + ) + + +@registry.reg("cuda.gemm_rrr_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return common_gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + PROBLEM_ARGS_TEMPLATE, + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rrr_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, + problem_args_template=None, +): + if problem_args_template is None: + problem_args = PROBLEM_ARGS_TEMPLATE.render() + 
else: + problem_args = problem_args_template.render() + + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + return common_permute.gen_function( + func_attrs, + common.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims, + weight_ndims, + output_ndims, + dim_info_dict, + emit_kernel=True, + support_split_k=True, + output_addr_calculator=common.OUTPUT_ADDR_CALCULATOR.render( + stride_dim="*b_dim1", output_accessor=func_attrs["output_accessors"][0] + ), + extra_code=common_permute.EXTRA_CODE.render(), + ) + + +@registry.reg("cuda.gemm_rrr_permute.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + support_split_k=True, + ) + + +@registry.reg("cuda.gemm_rrr_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + a = func_attrs["inputs"][0] + b = func_attrs["inputs"][1] + + output = func_attrs["outputs"][0] + has_bias = False + adims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][0].original_shapes + ] + bdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["input_accessors"][1].original_shapes + ] + cdims = [ + "&" + dim._attrs["name"] + for dim in func_attrs["output_accessors"][0].original_shapes + ] + return common.FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + a_ptr=a._attrs["name"], + b_ptr=b._attrs["name"], + has_bias=has_bias, + c_ptr=output._attrs["name"], + split_k=func_attrs["split_k"], + adims=adims, + bdims=bdims, + cdims=cdims, + indent=indent, + ) + + +@registry.reg("cuda.gemm_rrr_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common.py b/python/aitemplate/backend/cuda/gemm_universal/group_common.py new file mode 100644 index 000000000..6568b3c4f --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_common.py @@ -0,0 +1,974 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common functions and templates for group-gemm-family kernels +""" +import re +from hashlib import sha1 +from typing import Any, Dict, List + +import jinja2 + +from ...common import tensor_accessor_codegen +from . 
import common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +DIM_DEFS_TEMPLATE = jinja2.Template( + """ +{% for dim_name in dim_names %} +{% set dim_value = dim_values[loop.index - 1] %} +{{indent}}int64_t {{dim_name}} = {{dim_value}}; +{% endfor %} +""" +) + + +GROUP_OUTPUT_ADDR_CALCULATOR = jinja2.Template( + """ + {% if output_accessor.is_contiguous %} + int64_t output_stride_{{group_id}} = GROUP_{{group_id}}_{{output_stride_dim}}; + int64_t output_offset_{{group_id}} = 0; + {% else %} + int64_t output_stride_{{group_id}} = {{output_accessor.actual_total_elements_from_stride_dim}}; + int64_t output_offset_{{group_id}} = {{output_accessor.offset}}; + {% endif %} +""" +) + + +GROUP_INPUT_A_ADDR_CALCULATOR = jinja2.Template( + """ + {% if input_a_accessor.is_contiguous %} + int64_t input_a_stride_{{group_id}} = GROUP_{{group_id}}_{{input_a_stride_dim}}; + int64_t input_a_offset_{{group_id}} = 0; + {% else %} + int64_t input_a_stride_{{group_id}} = {{input_a_accessor.actual_total_elements_from_stride_dim}}; + int64_t input_a_offset_{{group_id}} = {{input_a_accessor.offset}}; + {% endif %} +""" +) + + +INSTANCE_TEMPLATE = jinja2.Template( + """ +{{config}} +using {{name}} = cutlass::gemm::device::GemmGrouped<{{config_name}}>; +""" +) + + +FUNC_DECL_TEMPLATE = jinja2.Template( + """ +{{indent}}void {{func_name}}( +{{indent}} int, +{{indent}} int, +{{indent}} int64_t*, +{{indent}} int, +{{indent}} cutlass::half_t*, +{% for i in range(groups) %} +{{indent}} cutlass::half_t*, +{{indent}} cutlass::half_t*, +{{indent}} cutlass::half_t*, +{% if has_bias %} +{{indent}} cutlass::half_t*, +{% endif %} +{% endfor %} +{{indent}} uint8_t*, +{% for i in range(groups) %} +{{indent}} int64_t*, +{{indent}} int64_t*, +{{indent}} int64_t*, +{{indent}} int64_t*, +{{indent}} int64_t*, +{{indent}} int64_t*, +{% endfor %} +{{indent}} cudaStream_t +{{indent}}); +""" +) + + +FUNC_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}( +{{indent}} device_properties.sharedMemPerMultiprocessor, +{{indent}} device_properties.multiProcessorCount, +{{indent}} &{{func_name}}_state, +{{indent}} {{problem_count}}, +{{indent}} {{device_args}}, +{% for operand in group_operands %} +{{indent}} {{operand[0]}}, +{{indent}} {{operand[1]}}, +{{indent}} {{operand[2]}}, +{% if has_bias %} +{{indent}} {{operand[3]}}, +{% endif %} +{% endfor %} +{{indent}} global_workspace, +{% for operand_dim in group_operand_dims %} +{{indent}} {{operand_dim[0]}}, +{{indent}} {{operand_dim[1]}}, +{{indent}} {{operand_dim[2]}}, +{{indent}} {{operand_dim[3]}}, +{{indent}} {{operand_dim[4]}}, +{{indent}} {{operand_dim[5]}}, +{% endfor %} +{{indent}} stream +{{indent}}); +""" +) + + +ADAPTOR_FUNCTION_TEMPLATE = jinja2.Template( + """ +{% if is_profiler %} +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +{{instance}} + 
+{% endif %} + +{{indent}}template +{{indent}}void {{func_name}}_adapter( + int sharedMemPerMultiprocessor, + int multiProcessorCount, + uint8_t* workspace, + int problem_count, + cutlass::gemm::GemmCoord* problem_sizes_device, + cutlass::half_t **ptr_A, + cutlass::half_t **ptr_B, + cutlass::half_t **ptr_C, +{% if has_bias %} + cutlass::half_t **ptr_bias, +{% endif %} + int64_t* lda, + int64_t* ldb, + int64_t* ldc, +{% if has_bias %} + int64_t* ldd, +{% endif %} + int occupancy, + cudaStream_t stream) { + {{exec_program}} + throw std::runtime_error( + "Unsupported workload for this gemm specialization." + ); +} +""", + trim_blocks=True, + lstrip_blocks=True, +) + + +ADAPTER_CALL_TEMPLATE = jinja2.Template( + """ +{{indent}}{{func_name}}_adapter<{{instance}}>( + {{sharedMemPerMultiprocessor}}, + {{multiProcessorCount}}, + {{workspace}}, + {{problem_count}}, + {{problem_sizes_device}}, + {{ptr_A}}, + {{ptr_B}}, + {{ptr_C}}, +{% if has_bias %} + {{ptr_bias}}, +{% endif %} + {{lda}}, + {{ldb}}, + {{ldc}}, +{% if has_bias %} + {{ldd}}, +{% endif %} + {{instance}}::maximum_active_blocks(), + stream + ); +""", + trim_blocks=True, + lstrip_blocks=True, +) + + +EXEC_TEMPLATE = jinja2.Template( + """ +// TODO: cast to right dtype +{{indent}}using ElementComputeEpilogue = typename GEMMKind::ElementAccumulator; +{{indent}}// int smem_size = int(sizeof(typename GEMMKind::GemmKernel::SharedStorage)); +{{indent}}// int occupancy = std::min(2, int(sharedMemPerMultiprocessor / smem_size)); +{{indent}}int threadblock_count = multiProcessorCount * occupancy; +{{indent}}// Early exit +{{indent}}if (!threadblock_count) { +{{indent}} throw std::runtime_error( +{{indent}} "Active CUDA device lacks hardware resources to run CUTLASS Grouped GEMM kernel." +{{indent}} ); +{{indent}}} + + +{{indent}}typename GEMMKind::Arguments arguments{ + +{{problem_args}} + +{{indent}}}; +{{indent}}GEMMKind gemm_op; +{% if is_profiler %} +{{indent}}// Debug BGM: https://www.youtube.com/watch?v=rRwxfYlgG-M +{{indent}}size_t workspace_size = gemm_op.get_workspace_size(arguments); +{{indent}}cutlass::device_memory::allocation local_workspace(workspace_size); +{{indent}}workspace = local_workspace.get(); +{{indent}}GLOBAL_WORKSPACE_SIZE = workspace_size; +{% endif %} +{{indent}}// TODO: cutlass bug here +{{indent}}// auto status = gemm_op.can_implement(arguments); +{{indent}}// CUTLASS_CHECK(status); +{{indent}}auto status = gemm_op.initialize(arguments, workspace, stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}status = gemm_op(stream); +{{indent}}CUTLASS_CHECK(status); +{{indent}}return; +""" +) + + +SRC_TEMPLATE = jinja2.Template( + """ +#include +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/kernel/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_fill.h" + +#define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("Got cutlass error: ") + cutlassGetStatusString(error) + \\ + " at: " + std::to_string(__LINE__); \\ + std::cerr << msg << std::endl; \\ + throw std::runtime_error(msg); \\ + } \\ + } + +namespace { +template +void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) { + size_t bytes = 
count * cutlass::sizeof_bits::value / 8; + if (bytes == 0 && count > 0) + bytes = 1; + cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind)); + if (cuda_error != cudaSuccess) { + throw std::runtime_error("cudaMemcpy() failed"); + } +} +} // namespace + +{{instances}} + +{{func_adapter}} + +void {{function_name}} ( + int sharedMemPerMultiprocessor, + int multiProcessorCount, + int64_t* func_state, + int problem_count, + cutlass::half_t* device_args, + {% for operand in group_operands %} + cutlass::half_t* {{operand[0]}}, + cutlass::half_t* {{operand[1]}}, + cutlass::half_t* {{operand[2]}}, + {% if has_bias %} + cutlass::half_t* {{operand[3]}}, + {% endif %} + {% endfor %} + uint8_t* global_workspace, +{% for operand_dim in group_operand_dims %} + int64_t* {{operand_dim[0]}}, + int64_t* {{operand_dim[1]}}, + int64_t* {{operand_dim[2]}}, + int64_t* {{operand_dim[3]}}, + int64_t* {{operand_dim[4]}}, + int64_t* {{operand_dim[5]}}, +{% endfor %} + cudaStream_t stream) { + + {{shape_function}} + + if (!device_args) { + throw std::runtime_error("device_args is NULL!"); + } + // It's a bit tricky to check individual gemms in group_gemm cases, + // so let's rule out them all if any of the input/output tensors is zero-sized. + // We can re-visit this part if we hit any use case, e.g. one input of the + // gemm is zero-sized, but all others are non-zero-sized. +{% for operand_dim in group_operand_dims %} + if (*{{operand_dim[0]}} == 0 || *{{operand_dim[1]}} == 0 || + *{{operand_dim[2]}} == 0 || *{{operand_dim[3]}} == 0 || + *{{operand_dim[4]}} == 0 || *{{operand_dim[5]}} == 0) { + throw std::runtime_error("Zero-sized tensors are not supported yet"); + } +{% endfor %} +{% for operand in group_operands %} + if (!{{operand[0]}}) { + throw std::runtime_error("{{operand[0]}} is NULL!"); + } + if (!{{operand[1]}}) { + throw std::runtime_error("{{operand[1]}} is NULL!"); + } + if (!{{operand[2]}}) { + throw std::runtime_error("{{operand[2]}} is NULL!"); + } +{% if has_bias %} + if (!{{operand[3]}}) { + throw std::runtime_error("{{operand[3]}} is NULL!"); + } +{% endif %} + +{% endfor %} + + uint8_t* arg_ptr = (uint8_t*) device_args; + // problem_sizes_device: N * GemmCoord -> N * 3 * sizeof(int64_t) -> 32 * N + // ptrA/B/C/D: N * 8 for each + // lda/b/c/d: N * 8 for each + // total: N * 8 * 4 + N * 8 * 4 + N * 8 * 4 + // total: 3 * 32 * N + int offset = 0; + auto problem_sizes_device = + (cutlass::gemm::GemmCoord*)(arg_ptr + offset); + offset += 32 * problem_count; + + auto ptr_A = (cutlass::half_t**)(arg_ptr + offset); + offset += 8 * problem_count; + auto ptr_B = (cutlass::half_t**)(arg_ptr + offset); + offset += 8 * problem_count; + auto ptr_C = (cutlass::half_t**)(arg_ptr + offset); + offset += 8 * problem_count; + {% if has_bias %} + auto ptr_bias = (cutlass::half_t**)(arg_ptr + offset); + offset += 8 * problem_count; + {% endif %} + + auto lda = (int64_t*)(arg_ptr + offset); + offset += 8 * problem_count; + auto ldb = (int64_t*)(arg_ptr + offset); + offset += 8 * problem_count; + auto ldc = (int64_t*)(arg_ptr + offset); + {% if has_bias %} + offset += 8 * problem_count; + auto ldd = (int64_t*)(arg_ptr + offset); + {% endif %} + // offset += 8 * problem_count; + + if (*func_state != GROUP_0_AM) { + // need update + std::vector problem_sizes; + std::vector ptr_A_host; + std::vector ptr_B_host; + std::vector ptr_C_host; + {% if has_bias %} + std::vector ptr_bias_host; + {% endif %} + std::vector lda_host; + std::vector ldb_host; + std::vector ldc_host; + {% if has_bias %} + std::vector ldd_host; + 
{% endif %} + + {% for operand in group_operands %} + ptr_A_host.push_back({{operand[0]}} + input_a_offset_{{loop.index0}}); + ptr_B_host.push_back({{operand[1]}}); + ptr_C_host.push_back({{operand[2]}} + output_offset_{{loop.index0}}); + {% if has_bias %} + ptr_bias_host.push_back({{operand[3]}}); + {% endif %} + {% endfor %} + + // AM: 0 + // AK: 1 + // BN: 2 + {% for operand_dim in group_operand_dims %} + cutlass::gemm::GemmCoord problem_{{loop.index0}}( + GROUP_{{loop.index0}}_M, + GROUP_{{loop.index0}}_N, + GROUP_{{loop.index0}}_K); + problem_sizes.emplace_back(problem_{{loop.index0}}); + lda_host.push_back(input_a_stride_{{loop.index0}}); + ldb_host.push_back(GROUP_{{loop.index0}}_K); + {% if has_bias %} + ldc_host.push_back(0); + ldd_host.push_back(output_stride_{{loop.index0}}); + {% else %} + ldc_host.push_back(output_stride_{{loop.index0}}); + {% endif %} + {% endfor %} + + copy(problem_sizes_device, + problem_sizes.data(), + problem_count, cudaMemcpyHostToDevice); + + copy(ptr_A, + ptr_A_host.data(), + problem_count, cudaMemcpyHostToDevice); + + copy(ptr_B, + ptr_B_host.data(), + problem_count, cudaMemcpyHostToDevice); + + copy(ptr_C, + ptr_C_host.data(), + problem_count, cudaMemcpyHostToDevice); + + {% if has_bias %} + copy(ptr_bias, + ptr_bias_host.data(), + problem_count, cudaMemcpyHostToDevice); + {% endif %} + + copy(lda, + lda_host.data(), + problem_count, cudaMemcpyHostToDevice); + + copy(ldb, + ldb_host.data(), + problem_count, cudaMemcpyHostToDevice); + + copy(ldc, + ldc_host.data(), + problem_count, cudaMemcpyHostToDevice); + + {% if has_bias %} + copy(ldd, + ldd_host.data(), + problem_count, cudaMemcpyHostToDevice); + {% endif %} + + *func_state = GROUP_0_AM; + } + {{exec_paths}} +} + + +""" +) + + +ARGS_PARSER_TEMPLATE = jinja2.Template( + """ + int problem_count = std::atoi(argv[1]); + int64_t idx = 2; + std::vector problem_sizes; + while (idx < argc) { + int64_t M = std::atoi(argv[idx++]); + int64_t N = std::atoi(argv[idx++]); + int64_t K = std::atoi(argv[idx++]); + cutlass::gemm::GemmCoord problem(M, N, K); + problem_sizes.push_back(problem); + } +""" +) + + +TENSOR_DECL_TEMPLATE = jinja2.Template( + """ + cutlass::DeviceAllocation blob_A; + cutlass::DeviceAllocation blob_B; + cutlass::DeviceAllocation blob_C; +{% if has_bias %} + cutlass::DeviceAllocation blob_Bias; +{% endif %} + int64_t total_size_A = 0; + int64_t total_size_B = 0; + int64_t total_size_C = 0; +{% if has_bias %} + int64_t total_size_Bias = 0; +{% endif %} + + cutlass::DeviceAllocation problem_sizes_device; + + std::vector lda_host; + std::vector ldb_host; + std::vector ldc_host; +{% if has_bias %} + std::vector ldd_host; +{% endif %} + + + cutlass::DeviceAllocation lda; + cutlass::DeviceAllocation ldb; + cutlass::DeviceAllocation ldc; +{% if has_bias %} + cutlass::DeviceAllocation ldd; +{% endif %} + + std::vector ptr_A_host; + std::vector ptr_B_host; + std::vector ptr_C_host; +{% if has_bias %} + std::vector ptr_bias_host; +{% endif %} + + + cutlass::DeviceAllocation ptr_A; + cutlass::DeviceAllocation ptr_B; + cutlass::DeviceAllocation ptr_C; +{% if has_bias %} + cutlass::DeviceAllocation ptr_bias; +{% endif %} + + + for (auto & mnk : problem_sizes) { + int64_t M = mnk.m(); + int64_t N = mnk.n(); + int64_t K = mnk.k(); + lda_host.push_back(K); + ldb_host.push_back(K); +{% if has_bias %} + ldc_host.push_back(0); + ldd_host.push_back(N); +{% else %} + ldc_host.push_back(N); +{% endif %} + + total_size_A += M * K; + total_size_B += N * K; + total_size_C += M * N; +{% if has_bias %} + 
total_size_Bias += N; +{% endif %} + } + + blob_A.reset(total_size_A); + blob_B.reset(total_size_B); + blob_C.reset(total_size_C); +{% if has_bias %} + blob_Bias.reset(total_size_Bias); +{% endif %} + + int64_t offset_A = 0; + int64_t offset_B = 0; + int64_t offset_C = 0; +{% if has_bias %} + int64_t offset_Bias = 0; +{% endif %} + + for (int i = 0; i < problem_sizes.size(); ++i) { + auto & mnk = problem_sizes.at(i); + int64_t M = mnk.m(); + int64_t N = mnk.n(); + int64_t K = mnk.k(); + + ptr_A_host.push_back(blob_A.get() + offset_A); + ptr_B_host.push_back(blob_B.get() + offset_B); + ptr_C_host.push_back(blob_C.get() + offset_C); +{% if has_bias %} + ptr_bias_host.push_back(blob_Bias.get() + offset_Bias); +{% endif %} + offset_A += M * K; + offset_B += N * K; + offset_C += M * N; +{% if has_bias %} + offset_Bias += N; +{% endif %} + } + + + lda.reset(problem_count); + ldb.reset(problem_count); + ldc.reset(problem_count); +{% if has_bias %} + ldd.reset(problem_count); +{% endif %} + lda.copy_from_host(lda_host.data()); + ldb.copy_from_host(ldb_host.data()); + ldc.copy_from_host(ldc_host.data()); +{% if has_bias %} + ldd.copy_from_host(ldd_host.data()); +{% endif %} + + ptr_A.reset(problem_count); + ptr_B.reset(problem_count); + ptr_C.reset(problem_count); +{% if has_bias %} + ptr_bias.reset(problem_count); +{% endif %} + ptr_A.copy_from_host(ptr_A_host.data()); + ptr_B.copy_from_host(ptr_B_host.data()); + ptr_C.copy_from_host(ptr_C_host.data()); +{% if has_bias %} + ptr_bias.copy_from_host(ptr_bias_host.data()); +{% endif %} + + problem_sizes_device.reset(problem_count); + problem_sizes_device.copy_from_host(problem_sizes.data()); + +""" +) + + +def get_group_gemm_instance_template_params(op_def: str) -> List[str]: + """ + For a given op_def string generated by cutlass's group_gemm emiter, parse and + return the group_gemm instance's template parameters. + """ + params = re.findall( + r"cutlass::gemm::kernel::DefaultGemmUniversal<([\s\S]+)>::GemmKernel;", op_def + ) + assert len(params) == 1 + param = params[0] + gemm_universal_params = param.strip().split("\n") + gemm_universal_params = [param.strip(",") for param in gemm_universal_params] + assert len(gemm_universal_params) == 20, ( + "expected len(gemm_universal_params) to be 20, but got " + "{len(gemm_universal_params)}, {gemm_universal_params=}" + ) + return gemm_universal_params + + +def update_alignments_in_group_gemm_instance( + op_def: str, func_attrs: Dict[str, Any], for_profiler: bool +) -> str: + """ + update kAlignmentA, kAlignmentB, and epilogue_alignment in op_def, + which is a group_gemm instance emitted by the gemm instance emitter of cutlass. 
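+
+    Roughly speaking, the epilogue alignment already present in op_def is
+    clamped to the largest alignment supported by each output tensor
+    accessor (the min(...) loop below), presumably so that strided or
+    offset outputs are not emitted with over-aligned vectorized stores.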
+ """ + if for_profiler: + return op_def + + # TODO: adjust a_alignment, b_alignment based on input_accessors + + gemm_params = get_group_gemm_instance_template_params(op_def) + epilogue_align_idx = 12 + epilogue_curr_align = gemm_params[epilogue_align_idx].strip() + + output_accessors = func_attrs["output_accessors"] + epilogue_alignment = int(epilogue_curr_align) + for output_accessor in output_accessors: + epilogue_alignment = min( + epilogue_alignment, + tensor_accessor_codegen.find_max_alignment_for_accessor(output_accessor), + ) + + instance_lines = op_def.split("\n") + # a_align_idx + 4 in the full instance string + idx_offset = 4 + + epilogue_curr_align_line = instance_lines[epilogue_align_idx + idx_offset] + assert epilogue_curr_align == epilogue_curr_align_line.strip( + " ," + ), f"expected {epilogue_curr_align=} equal to {epilogue_curr_align_line=}" + instance_lines[epilogue_align_idx + idx_offset] = epilogue_curr_align_line.replace( + epilogue_curr_align, str(epilogue_alignment) + ) + return "\n".join(instance_lines) + + +def group_gemm_instance(op_def: str, func_attrs: Dict[str, Any], for_profiler: bool): + # TODO: This is a dirty thing need to add an extra emitter to clean this up + op_def = update_alignments_in_group_gemm_instance(op_def, func_attrs, for_profiler) + tmp = op_def.replace("DefaultGemmUniversal", "DefaultGemmGrouped") + tmp = tmp.replace("false,", "") + # force output to be row major + # cutlass lib can't generate row major output kernels + tmp = re.sub( + r"cutlass::layout::ColumnMajor,\n", "cutlass::layout::RowMajor,\n", tmp + ) + tmp = re.sub( + r"GemmIdentityThreadblockSwizzle<\d>", + "GemmBatchedIdentityThreadblockSwizzle", + tmp, + ) + tmp = re.sub( + r"cutlass::arch::OpMultiplyAdd", + "cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly,\n" + + "cutlass::arch::OpMultiplyAdd", + tmp, + ) + return tmp + + +def gen_profiler( + func_attrs, + workdir, + shape_template, + problem_args_template, + has_bias=False, + output_addr_calculator="", +): + op_type = func_attrs["op"] + op_instance = func_attrs["op_instance"] + + file_pairs = [] + for op_name, op in op_instance.items(): + config = common.emit_instance( + op, + for_profiler=True, + f_instance_convertor=group_gemm_instance, + emit_kernel=True, + ) + config_name = common.extract_config_name(config) + name = "GemmInstance" + instance = INSTANCE_TEMPLATE.render( + config_name=config_name, name=name, config=config + ) + + # instance = instance + exec_program = EXEC_TEMPLATE.render( + indent=" ", is_profiler=True, problem_args=problem_args_template.render() + ) + op_func = ADAPTOR_FUNCTION_TEMPLATE.render( + instance=instance, + is_profiler=True, + func_name=name, + indent=" ", + exec_program=exec_program, + has_bias=has_bias, + ) + func_call = ADAPTER_CALL_TEMPLATE.render( + func_name=name, + instance=name, + sharedMemPerMultiprocessor="device_properties.sharedMemPerMultiprocessor", + multiProcessorCount="device_properties.multiProcessorCount", + workspace="global_workspace", + problem_count="problem_count", + problem_sizes_device="problem_sizes_device.get()", + ptr_A="ptr_A.get()", + ptr_B="ptr_B.get()", + ptr_C="ptr_C.get()", + has_bias=has_bias, + ptr_bias="ptr_bias.get()", + lda="lda.get()", + ldb="ldb.get()", + ldc="ldc.get()", + ldd="ldd.get()", + ) + code = common.PROFILER_TEMPLATE.render( + op_func=op_func, + args_parse=ARGS_PARSER_TEMPLATE.render(), + func_call=func_call, + name=name, + tensor_decl=TENSOR_DECL_TEMPLATE.render(name=name, has_bias=has_bias), + ) + common.add_profiler(file_pairs, 
workdir, op_type, op_name, code) + # build + common.build_profiler(file_pairs) + + +def gen_function( + func_attrs, + exec_cond_template, + shape_eval_template, + problem_args_template, + has_bias=False, +): + problem_args = problem_args_template.render() + func_name = func_attrs["name"] + exec_path = func_attrs["exec_path"] + op_instance = func_attrs["op_instance"] + inst_def_flag = set() + instances = {} + instance_decl = "" + emit_kernel = True + for key, value in exec_path.items(): + fname = "f" + sha1(key.encode()).hexdigest() + algo = value.algo + if algo not in inst_def_flag: + config = common.emit_instance( + op_instance[algo], + for_profiler=False, + f_instance_convertor=group_gemm_instance, + emit_kernel=emit_kernel, + func_attrs=func_attrs, + ) + inst_def_flag.add(algo) + else: + raise ValueError(f"Algo {algo} already in inst_def_flags") + + inst = INSTANCE_TEMPLATE.render( + config=config, + name=fname, + config_name=common.extract_config_name(config), + ) + instances[key] = inst + instance_decl += inst + kwargs = {} + kwargs["indent"] = " " + kwargs["dtype"] = "int64_t " + group_operand_dims = [] + output_addr_cals = [] + input_a_addr_cals = [] + num_inputs_per_group = 3 if has_bias else 2 + + for i in range(func_attrs["groups"]): + dim_names = [] + for j in range(6): + dim_names.append("*dim_{group}_{dim}".format(group=i, dim=j)) + group_operand_dims.append(dim_names) + output_addr_cal = GROUP_OUTPUT_ADDR_CALCULATOR.render( + group_id=i, + output_stride_dim="CN", + output_accessor=func_attrs["output_accessors"][i], + ) + output_addr_cals.append(output_addr_cal) + input_a_addr_cal = GROUP_INPUT_A_ADDR_CALCULATOR.render( + group_id=i, + input_a_stride_dim="AK", + input_a_accessor=func_attrs["input_accessors"][i * num_inputs_per_group], + ) + input_a_addr_cals.append(input_a_addr_cal) + kwargs["group_operand_dims"] = group_operand_dims + kwargs["output_addr_cals"] = output_addr_cals + kwargs["input_a_addr_cals"] = input_a_addr_cals + shape_func = shape_eval_template.render(**kwargs) + exec_paths = "" + # + for key, _ in instances.items(): + fname = "f" + sha1(key.encode()).hexdigest() + program = ADAPTER_CALL_TEMPLATE.render( + indent=" ", + func_name=func_name, + instance=fname, + sharedMemPerMultiprocessor="sharedMemPerMultiprocessor", + multiProcessorCount="multiProcessorCount", + workspace="global_workspace", + problem_count=func_attrs["groups"], + problem_sizes_device="problem_sizes_device", + ptr_A="ptr_A", + ptr_B="ptr_B", + ptr_C="ptr_C", + has_bias=has_bias, + ptr_bias="ptr_bias", + lda="lda", + ldb="ldb", + ldc="ldc", + ldd="ldd", + ) + exec_inst = exec_cond_template.render(indent=" ", cond=key, program=program) + exec_paths += exec_inst + + exec_program = EXEC_TEMPLATE.render( + indent=" ", is_profiler=False, problem_args=problem_args + ) + adapter_func = ADAPTOR_FUNCTION_TEMPLATE.render( + func_name=func_name, exec_program=exec_program, has_bias=has_bias + ) + group_operands = [] + group_operand_dims = [] + for i in range(func_attrs["groups"]): + operand = [] + operand.append("ptr_{group}_a".format(group=i)) + operand.append("ptr_{group}_b".format(group=i)) + operand.append("ptr_{group}_c".format(group=i)) + if has_bias: + operand.append("ptr_{group}_bias".format(group=i)) + dims = [] + for j in range(6): + dims.append("dim_{group}_{dim}".format(group=i, dim=j)) + group_operands.append(operand) + group_operand_dims.append(dims) + + return SRC_TEMPLATE.render( + instances=instance_decl, + func_adapter=adapter_func, + function_name=func_name, + 
shape_function=shape_func, + group_operands=group_operands, + group_operand_dims=group_operand_dims, + exec_paths=exec_paths, + has_bias=has_bias, + ) + + +def gen_function_call(func_attrs, ndims, has_bias=False, indent=" "): + group_operands = [] + group_operand_dims = [] + output_accessors = [a.is_contiguous for a in func_attrs["output_accessors"]] + with_single_strided_output = False + if "output_stride_dim" in func_attrs: + output_accessors = list(set(output_accessors)) + # we only support two cases: either all outputs are contiguous or none + # of them are + assert len(output_accessors) == 1 + with_single_strided_output = not output_accessors[0] + for i in range(func_attrs["groups"]): + a = func_attrs["inputs"][i * ndims] + b = func_attrs["inputs"][i * ndims + 1] + if has_bias: + bias = func_attrs["inputs"][i * ndims + 2] + c_idx = 0 if with_single_strided_output else i + c = func_attrs["outputs"][c_idx] + input_a_accessor = func_attrs["input_accessors"][i * ndims] + input_b_accessor = func_attrs["input_accessors"][i * ndims + 1] + output_accessor = func_attrs["output_accessors"][i] + + ashape = input_a_accessor.original_shapes + bshape = input_b_accessor.original_shapes + cshape = output_accessor.original_shapes + operands = [] + operand_dims = [] + operands.append(a._attrs["name"]) + operands.append(b._attrs["name"]) + operands.append(c._attrs["name"]) + if has_bias: + operands.append(bias._attrs["name"]) + operand_dims.append("&" + ashape[0]._attrs["name"]) + operand_dims.append("&" + ashape[1]._attrs["name"]) + operand_dims.append("&" + bshape[0]._attrs["name"]) + operand_dims.append("&" + bshape[1]._attrs["name"]) + operand_dims.append("&" + cshape[0]._attrs["name"]) + operand_dims.append("&" + cshape[1]._attrs["name"]) + group_operands.append(operands) + group_operand_dims.append(operand_dims) + device_args = f'reinterpret_cast(unique_workspace + {func_attrs["unique_workspace_offset"]})' + return FUNC_CALL_TEMPLATE.render( + func_name=func_attrs["name"], + problem_count=func_attrs["groups"], + device_args=device_args, + group_operands=group_operands, + group_operand_dims=group_operand_dims, + indent=indent, + has_bias=has_bias, + ) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py new file mode 100644 index 000000000..2b556fc83 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_common_bias.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common codegen functions for group_gemm_bias-family kernels. +""" +import jinja2 + +from . 
import group_common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + problem_sizes_device, + problem_count, + threadblock_count, + {ElementComputeEpilogue(1), ElementComputeEpilogue(1)}, + ptr_A, + ptr_B, + ptr_bias, + ptr_C, + lda, + ldb, + ldc, + ldd +""" +) + + +def gen_profiler( + func_attrs, + workdir, + shape_template, +): + group_common.gen_profiler( + func_attrs, workdir, shape_template, PROBLEM_ARGS_TEMPLATE, has_bias=True + ) + + +def gen_function( + func_attrs, + exec_cond_template, + shape_eval_template, +): + return group_common.gen_function( + func_attrs, + exec_cond_template, + shape_eval_template, + PROBLEM_ARGS_TEMPLATE, + has_bias=True, + ) + + +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return group_common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, groups=func_attrs["groups"], has_bias=True + ) + + +def gen_function_call(func_attrs, indent=" "): + ndims = 3 + return group_common.gen_function_call(func_attrs, ndims, has_bias=True) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py new file mode 100644 index 000000000..354039b40 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr.py @@ -0,0 +1,102 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for group_gemm_rcr. +""" +import jinja2 + +from ... import registry +from . import common, group_common +from .layout import RCR + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + +PROBLEM_ARGS_TEMPLATE = jinja2.Template( + """ + problem_sizes_device, + problem_count, + threadblock_count, + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + ptr_A, + ptr_B, + ptr_C, + ptr_C, + lda, + ldb, + ldc, + ldc +""" +) + + +@registry.reg("cuda.group_gemm_rcr.config") +def group_rcr_config(func_attrs, dtype="float16"): + common.make_fproc_f16(func_attrs, RCR) + + +@registry.reg("cuda.group_gemm_rcr.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + group_common.gen_profiler( + func_attrs, workdir, shape_template, PROBLEM_ARGS_TEMPLATE + ) + + +@registry.reg("cuda.group_gemm_rcr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + shape_eval_template, +): + return group_common.gen_function( + func_attrs, + exec_cond_template, + shape_eval_template, + PROBLEM_ARGS_TEMPLATE, + ) + + +@registry.reg("cuda.group_gemm_rcr.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + return group_common.FUNC_DECL_TEMPLATE.render( + func_name=func_name, groups=func_attrs["groups"] + ) + + +@registry.reg("cuda.group_gemm_rcr.func_call") +def gen_function_call(func_attrs, indent=" "): + ndims = 2 + return group_common.gen_function_call(func_attrs, ndims) + + +@registry.reg("cuda.group_gemm_rcr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. 
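The two argument packs differ only in the epilogue scalars and in which pointer feeds the C operand: with bias, beta is 1 and ptr_bias is bound as the C source while ptr_C receives D; without bias, beta is 0 and ptr_C serves both roles. A minimal NumPy sketch of the assumed per-group math:

    import numpy as np

    # Assumed epilogue: D = alpha * (A @ B^T) + beta * C. Without bias the pack passes
    # beta = 0 and reuses ptr_C for both C and D; with bias it passes beta = 1 and
    # binds ptr_bias to the C operand, broadcast across the M rows by the kernel.
    M, N, K = 4, 8, 16
    rng = np.random.default_rng(0)
    A = rng.standard_normal((M, K), dtype=np.float32)
    B = rng.standard_normal((N, K), dtype=np.float32)  # RCR: B is stored as [N, K]
    bias = rng.standard_normal(N).astype(np.float32)
    D_no_bias = A @ B.T          # beta = 0: no C term
    D_bias = A @ B.T + bias      # beta = 1 with C = bias, broadcast over rows
    print(D_no_bias.shape, D_bias.shape)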
+ + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py new file mode 100644 index 000000000..c292c3e1d --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for group_gemm_rcr_bias. +""" +from ... import registry +from . import common, group_common_bias, group_gemm_rcr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.group_gemm_rcr_bias.config") +def group_rcr_config(func_attrs, dtype="float16"): + group_gemm_rcr.group_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.group_gemm_rcr_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + group_common_bias.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.group_gemm_rcr_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, +): + return group_common_bias.gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + ) + + +@registry.reg("cuda.group_gemm_rcr_bias.func_decl") +def gen_function_decl(func_attrs): + return group_common_bias.gen_function_decl(func_attrs) + + +@registry.reg("cuda.group_gemm_rcr_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + return group_common_bias.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.group_gemm_rcr_bias.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py new file mode 100644 index 000000000..9345c26e4 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_relu.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
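group_gemm_rcr_bias and the relu/sigmoid variants below are thin wrappers that re-register the shared group_common_bias code paths under their own op names. A stand-alone sketch of this decorator-based registry pattern (a toy re-implementation for illustration, not AITemplate's actual registry module):

    from typing import Callable, Dict

    _REGISTRY: Dict[str, Callable] = {}

    def reg(name: str):
        """Register a backend codegen function under a dotted op key."""
        def deco(fn: Callable) -> Callable:
            _REGISTRY[name] = fn
            return fn
        return deco

    @reg("cuda.group_gemm_rcr_bias.func_decl")
    def gen_function_decl(func_attrs):
        # A real implementation would delegate to group_common_bias.gen_function_decl.
        return f"void {func_attrs['name']}(...);"

    # The compiler can then look up codegen for an op by name at build time:
    print(_REGISTRY["cuda.group_gemm_rcr_bias.func_decl"]({"name": "group_gemm_rcr_bias_0"}))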
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for group_gemm_rcr_bias_relu. +""" +from ... import registry +from . import common, group_common_bias, group_gemm_rcr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.config") +def group_rcr_config(func_attrs, dtype="float16"): + group_gemm_rcr.group_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + group_common_bias.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, +): + return group_common_bias.gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + ) + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.func_decl") +def gen_function_decl(func_attrs): + return group_common_bias.gen_function_decl(func_attrs) + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.func_call") +def gen_function_call(func_attrs, indent=" "): + return group_common_bias.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.group_gemm_rcr_bias_relu.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py new file mode 100644 index 000000000..e247bbe2a --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/group_gemm_rcr_bias_sigmoid.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for group_gemm_rcr_bias_sigmoid. +""" +from ... import registry +from . 
import common, group_common_bias, group_gemm_rcr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.config") +def group_rcr_config(func_attrs, dtype="float16"): + group_gemm_rcr.group_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_profiler") +def gen_profiler(func_attrs, workdir, shape_template): + group_common_bias.gen_profiler(func_attrs, workdir, shape_template) + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.gen_function") +def gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, +): + return group_common_bias.gen_function( + func_attrs, + exec_cond_remplate, + shape_eval_template, + ) + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.func_decl") +def gen_function_decl(func_attrs): + return group_common_bias.gen_function_decl(func_attrs) + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.func_call") +def gen_function_call(func_attrs, indent=" "): + return group_common_bias.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.group_gemm_rcr_bias_sigmoid.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/layout.py b/python/aitemplate/backend/cuda/gemm_universal/layout.py new file mode 100644 index 000000000..8bab2b98e --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/layout.py @@ -0,0 +1,79 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +GeMM layout classes. 
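For reference, the assumed per-group math of the fused bias/activation variants registered above, written against PyTorch with illustrative shapes:

    import torch

    # Assumed per-group semantics (the activation is applied in the GEMM epilogue,
    # so no separate elementwise op is emitted):
    #   group_gemm_rcr_bias_relu:    y_i = relu(A_i @ B_i.T + bias_i)
    #   group_gemm_rcr_bias_sigmoid: y_i = sigmoid(A_i @ B_i.T + bias_i)
    a = torch.randn(8, 16)    # [M, K]
    b = torch.randn(32, 16)   # [N, K] (column-major B in RCR terms)
    bias = torch.randn(32)
    y_relu = torch.relu(a @ b.t() + bias)
    y_sigmoid = torch.sigmoid(a @ b.t() + bias)
    print(y_relu.shape, y_sigmoid.shape)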
+""" + +from dataclasses import dataclass + +# pylint: disable=C0415 + + +@dataclass +class Layout: + m = "M" + n = "N" + k = "K" + + +@dataclass +class RCR(Layout): + """ + Layout: A[RowMajor], B[ColumnMajor], C[RowMajor] + """ + + cutlass_layout_a = "cutlass::layout::RowMajor" + cutlass_layout_b = "cutlass::layout::ColumnMajor" + cutlass_layout_c = "cutlass::layout::RowMajor" + stride_a = "K" + stride_b = "K" + stride_c = "N" + + args_parser = """ + int64_t a_dim0 = M; + int64_t a_dim1 = K; + int64_t b_dim0 = N; + int64_t b_dim1 = K; + int64_t c_dim0 = M; + int64_t c_dim1 = N; +""" + + @staticmethod + def fproc_op(op): + import cutlass_lib + + row_major = cutlass_lib.library.LayoutType.RowMajor + op.C.layout = row_major + + @staticmethod + def fcond_op(op): + import cutlass_lib + + row_major = cutlass_lib.library.LayoutType.RowMajor + col_major = cutlass_lib.library.LayoutType.ColumnMajor + return op.A.layout == row_major and op.B.layout == col_major + + @staticmethod + def cutlass_lib_layouts(): + """ + return [layout_a, layout_b, layout_c] in the form of cutlass_lib definitions + """ + import cutlass_lib + + return [ + cutlass_lib.library.LayoutType.RowMajor, + cutlass_lib.library.LayoutType.ColumnMajor, + cutlass_lib.library.LayoutType.RowMajor, + ] diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py new file mode 100644 index 000000000..580a3b005 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr.py @@ -0,0 +1,124 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm021fc_ccr, which computes +[b, m, n] = bmm([b, k, m], [1, n, k]). +""" +from ... import registry +from . 
import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "0", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "M", + "ldb": "K", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.perm021fc_ccr.config") +def gemm_ccr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.ColumnMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.perm021fc_ccr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["B", "K", "M"], b_dims=["1", "N", "K"], c_dims=["B", "M", "N"] + ) + + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.perm021fc_ccr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.perm021fc_ccr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.perm021fc_ccr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.perm021fc_ccr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py new file mode 100644 index 000000000..b4f320de9 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias.py @@ -0,0 +1,130 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
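A hedged PyTorch reference for perm021fc_ccr as described in its docstring, assuming A of shape [B, K, M] and a single shared weight of shape [N, K]:

    import torch

    # Assumed reference: out[b, m, n] = bmm(A[b, k, m], B[1, n, k]), i.e. A is read
    # column-major per batch and the single B acts as a shared fully connected weight.
    B_, M, N, K = 4, 8, 16, 32
    a = torch.randn(B_, K, M)
    w = torch.randn(1, N, K)
    out = torch.einsum("bkm,nk->bmn", a, w.squeeze(0))
    # Equivalent formulation via explicit permute + matmul:
    ref = a.permute(0, 2, 1) @ w.squeeze(0).t()
    print(out.shape, torch.allclose(out, ref, atol=1e-5))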
+# +""" +Codegen functions for perm021fc_ccr_bias, which computes +[b, m, n] = bmm([b, k, m], [1, n, k]) + bias[n]. +""" +from ... import registry +from . import bmm_common, common, common_bias, perm021fc_ccr + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "beta_value": 1, + "bias_ptr": "bias_ptr", + "a_batch_stride": "M * K", + "b_batch_stride": "0", + "bias_batch_stride": "0", + "c_batch_stride": "M * N", + "lda": "M", + "ldb": "K", + "ldbias": "0", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.perm021fc_ccr_bias.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return perm021fc_ccr.gemm_ccr_config(func_attrs, dtype) + + +@registry.reg("cuda.perm021fc_ccr_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["B", "K", "M"], b_dims=["1", "N", "K"], c_dims=["B", "M", "N"] + ) + + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + problem_args, + args_parser, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + ) + + +@registry.reg("cuda.perm021fc_ccr_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = _get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + dim_info_dict=dim_info_dict, + ) + + +@registry.reg("cuda.perm021fc_ccr_bias.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, input_ndims=input_ndims, weight_ndims=weight_ndims + ) + + +@registry.reg("cuda.perm021fc_ccr_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return bmm_common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.perm021fc_ccr_bias.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. 
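In the bias variant, beta_value is 1 and both ldbias and bias_batch_stride are 0, so the kernel re-reads the same [N]-length bias row for every output row of every batch. A short reference of the assumed math:

    import torch

    # Assumed reference for perm021fc_ccr_bias: out = bmm(A[b,k,m], B[1,n,k]) + bias[n].
    # ldbias = 0 / bias_batch_stride = 0 in the problem info above means the epilogue
    # reads the same bias row everywhere, i.e. a broadcast over [B, M].
    B_, M, N, K = 2, 4, 8, 16
    a = torch.randn(B_, K, M)
    w = torch.randn(N, K)
    bias = torch.randn(N)
    out = torch.einsum("bkm,nk->bmn", a, w) + bias
    print(out.shape)  # torch.Size([2, 4, 8])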
+ """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py new file mode 100644 index 000000000..5631bf3ca --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_ccr_bias_permute.py @@ -0,0 +1,165 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Common functions and templates for perm021_ccr_bias_permute, which computes +(A.permute(0, 2, 1)[col] @ B[col] + Bias).permute(0, 2, 1) +""" +from ... import registry + +from ..gemm_universal import common + +from . import ( + bmm_common, + bmm_permute_common, + common_bias, + common_permute, + perm021fc_ccr_bias, +) + + +EXTRA_CODE = """ + +#include "cutlass/gemm/device/gemm_universal_with_perm.h" + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/coord.h" +#include "cutlass/tensor_coord.h" + +namespace cutlass { +namespace layout { + +template +class Tensor3DPermute021BMM { + public: + using Index = int32_t; + using LongIndex = int64_t; + + Index col_permute; + Index row_permute; + Index stride_permute; + + private: + MatrixCoord extent_; + + public: + CUTLASS_HOST_DEVICE + Tensor3DPermute021BMM() {} + + CUTLASS_HOST_DEVICE + Tensor3DPermute021BMM(MatrixCoord extent) : extent_(extent) {} + + CUTLASS_HOST_DEVICE + void compute(Index col_init, Index row_init, Index stride_init, Index BMM_batch_idx) { + // Permute as torch.permute(X1, [0, 2, 1]) -> 3D Tensor indices as [i,j,k], the dimension of X is [D0, D1, D2], after permutation the dim of X1 is [D0, D2, D1]. 
+ // printf("BMM batch index: %d\t GEMM_m, GEMM_n = %d, %d\\n", BMM_batch_idx, extent_.row(), extent_.column()); + + int k = col_init; + int j = row_init; + int i = BMM_batch_idx; + + col_permute = j; + row_permute = k; + stride_permute = stride_init / extent_.column() * extent_.row(); // stride in Bytes + } +}; + +} // namespace layout +} // namespace cutlass +""" + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.config") +def config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common_permute.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.ColumnMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + permute_layout=func_attrs["layout"], + ) + + func_attrs["op_instance"] = common_permute.extract_config(fproc_f16, func_attrs) + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + return perm021fc_ccr_bias.gen_profiler(func_attrs, workdir, dim_info_dict) + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + mm_info = perm021fc_ccr_bias._get_problem_info( + alpha_value=func_attrs.get("alpha", 1) + ) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + return bmm_permute_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + extra_code=EXTRA_CODE, + has_bias=True, + ) + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, input_ndims=input_ndims, weight_ndims=weight_ndims + ) + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return bmm_common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.perm021fc_ccr_bias_permute.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py new file mode 100644 index 000000000..35a9ef77d --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc.py @@ -0,0 +1,127 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
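The Tensor3DPermute021BMM epilogue above swaps the row/column indices at store time, so the kernel writes its result already transposed in the last two dimensions instead of running a separate permute op. A hedged reference of the assumed end-to-end math:

    import torch

    # Assumed reference for perm021fc_ccr_bias_permute: same GEMM + bias as
    # perm021fc_ccr_bias, with the output stored as its (0, 2, 1) permutation.
    B_, M, N, K = 2, 4, 8, 16
    a = torch.randn(B_, K, M)
    w = torch.randn(N, K)
    bias = torch.randn(N)
    y = torch.einsum("bkm,nk->bmn", a, w) + bias  # [B, M, N]
    fused = y.permute(0, 2, 1)                    # what the permute epilogue emits: [B, N, M]
    print(fused.shape)  # torch.Size([2, 8, 4])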
+# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm021fc_crc, which computes +[b, n, m](col) = bmm([1, k, n](col), [b, k, m](row)). +""" +from ... import registry +from . import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "problem_size": "{N, M, K}", + "bias_ptr": "c_ptr", + "a_batch_stride": "0", + "b_batch_stride": "K * M", + "bias_batch_stride": "M * N", + "c_batch_stride": "M * N", + "lda": "N", + "ldb": "M", + "ldbias": "N", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.perm021fc_crc.config") +def gemm_crc_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.ColumnMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.ColumnMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.perm021fc_crc.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["1", "K", "N"], b_dims=["B", "K", "M"], c_dims=["B", "M", "N"] + ) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1), beta_value=0) + ) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.perm021fc_crc.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1), beta_value=0) + ) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + ) + + +@registry.reg("cuda.perm021fc_crc.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.perm021fc_crc.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.perm021fc_crc.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py new file mode 100644 index 000000000..187a0c6c1 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm021fc_crc_bias.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
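A hedged reference for perm021fc_crc, keeping in mind that a column-major [B, N, M] output occupies the same memory as a row-major [B, M, N] tensor (consistent with ldc = N and c_batch_stride = M * N in the problem info above):

    import torch

    # Assumed reference: out[b, n, m] (column-major) =
    # bmm(W[1, k, n] (column-major), A[b, k, m] (row-major)).
    B_, M, N, K = 2, 4, 8, 16
    w = torch.randn(1, K, N)
    a = torch.randn(B_, K, M)
    out = torch.einsum("kn,bkm->bnm", w.squeeze(0), a)
    print(out.shape)  # torch.Size([2, 8, 4])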
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm021fc_crc_bias, which computes +[b, n, m](col) = bmm([1, k, n](col), [b, k, m](row)) + bias[n]. +""" +from ... import registry +from . import bmm_common, common, common_bias, perm021fc_crc + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_problem_info(**kwargs): + problem_args = { + "beta_value": 1, + "problem_size": "{N, M, K}", + "bias_ptr": "bias_ptr", + "a_batch_stride": "0", + "b_batch_stride": "K * M", + "bias_batch_stride": "0", + "c_batch_stride": "M * N", + "lda": "N", + "ldb": "M", + "ldbias": "0", + "ldc": "N", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +@registry.reg("cuda.perm021fc_crc_bias.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return perm021fc_crc.gemm_crc_config(func_attrs, dtype) + + +@registry.reg("cuda.perm021fc_crc_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["1", "K", "N"], b_dims=["B", "K", "M"], c_dims=["B", "M", "N"] + ) + + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + ) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + problem_args, + args_parser, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + ) + + +@registry.reg("cuda.perm021fc_crc_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render( + mm_info=_get_problem_info(alpha_value=func_attrs.get("alpha", 1)) + ) + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + dim_info_dict=dim_info_dict, + ) + + +@registry.reg("cuda.perm021fc_crc_bias.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, input_ndims=input_ndims, weight_ndims=weight_ndims + ) + + +@registry.reg("cuda.perm021fc_crc_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return bmm_common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.perm021fc_crc_bias.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. 
+ + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py new file mode 100644 index 000000000..fe0ffe9cd --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm102_bmm_rcr, which computes +C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col)) +""" +from ... import registry +from . import bmm_common, common + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_default_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "K", + "b_batch_stride": "N * K", + "bias_batch_stride": "N", + "c_batch_stride": "N", + "lda": "K * B", + "ldb": "K", + "ldbias": "N * B", + "ldc": "N * B", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +# Currently only has output Tensor Accessor support. +def _get_strided_problem_info(func_attrs): + return bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + a_ptr="a_ptr", + b_ptr="b_ptr", + bias_ptr="(c_ptr + output_offset)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="K", + b_batch_stride="N * K", + bias_batch_stride="output_batch_stride", + c_batch_stride="output_batch_stride", + lda="K * B", + ldb="K", + ldbias="output_stride", + ldc="output_stride", + ) + + +def get_output_addr_calculator(func_attrs): + output_batch_stride_dim = "N" + output_stride_dim = "N * B" + output_offset = 0 + + if "output_accessors" in func_attrs: + output_accessor = func_attrs["output_accessors"][0] + if output_accessor.is_from_strided_tensor: + output_offset = output_accessor.offset + if not output_accessor.is_contiguous: + output_stride_dim = output_accessor.stride(0) + original_shapes = output_accessor.original_shapes + actual_shapes = output_accessor.actual_shapes + if len(actual_shapes) == 2 and actual_shapes[0] == original_shapes[0]: + # x = perm102_bmm_xxx(a, b) # [m, b, n] + # y = x.reshape()[x[0], -1] # [m, b * n] + # z = cat()(y0, y1, ..., yn, dim=-1) + output_batch_stride_dim = "N" + else: + raise NotImplementedError( + "Other strided fusion cases are not supported." 
+ ) + + output_addr_calculator = bmm_common.OUTPUT_ADDR_CALCULATOR.render( + output_batch_stride_dim=output_batch_stride_dim, + output_stride_dim=output_stride_dim, + output_offset_val=output_offset, + ) + + return output_addr_calculator + + +@registry.reg("cuda.perm102_bmm_rcr.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.ColumnMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.perm102_bmm_rcr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["M", "B", "K"], b_dims=["B", "N", "K"], c_dims=["M", "B", "N"] + ) + + mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.perm102_bmm_rcr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + bmm_problem_info = _get_strided_problem_info(func_attrs) + + # broadcasting is not supported + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + "", # input_addr_calculator + get_output_addr_calculator(func_attrs), + ) + + +@registry.reg("cuda.perm102_bmm_rcr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.perm102_bmm_rcr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.perm102_bmm_rcr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py new file mode 100644 index 000000000..8c34ecd48 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rcr_bias.py @@ -0,0 +1,155 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm102_bmm_rcr_bias, which computes +C[m, b, n](row) = bmm(A[m, b, k](row), B[b, n, k](col)) + bias[n]. +""" +from ... import registry +from . 
import bmm_common, common, common_bias, perm102_bmm_rcr +from .perm102_bmm_rcr import get_output_addr_calculator + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_default_problem_info(**kwargs): + problem_args = { + "beta_value": 1, + "bias_ptr": "bias_ptr", + "a_batch_stride": "K", + "b_batch_stride": "N * K", + "bias_batch_stride": "N", + "c_batch_stride": "N", + "lda": "K * B", + "ldb": "K", + "ldbias": "0", + "ldc": "N * B", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +# Currently only has output Tensor Accessor support. +def _get_strided_problem_info(func_attrs): + return bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + beta_value=1, + a_ptr="(a_ptr)", + b_ptr="(b_ptr)", + bias_ptr="(bias_ptr)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="K", + b_batch_stride="N * K", + bias_batch_stride="N", + c_batch_stride="output_batch_stride", + lda="K * B", + ldb="K", + ldbias="0", + ldc="output_stride", + ) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.config") +def gemm_rcr_config(func_attrs, dtype="float16"): + return perm102_bmm_rcr.gemm_rcr_config(func_attrs, dtype) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["M", "B", "K"], b_dims=["B", "N", "K"], c_dims=["M", "B", "N"] + ) + + mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + problem_args, + args_parser, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + ) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + bmm_problem_info = _get_strided_problem_info(func_attrs) + + # broadcasting is not supported + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + dim_info_dict=dim_info_dict, + output_addr_calculator=get_output_addr_calculator(func_attrs), + ) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, input_ndims=input_ndims, weight_ndims=weight_ndims + ) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return bmm_common.gen_function_call( + func_attrs, indent, bias_ptr_arg=bias._attrs["name"] + ) + + +@registry.reg("cuda.perm102_bmm_rcr_bias.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. 
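For the perm102 family (including the rrr variants that follow, which differ only in B being stored row-major as [B, K, N]), a sketch of the assumed math and of the strided-output fusion that get_output_addr_calculator enables:

    import torch

    # Assumed semantics: C[m, b, n] = bmm(A[m, b, k], B[b, n, k]); the batch dimension
    # sits in the middle of A and C, which is why lda / ldc are "K * B" / "N * B" above.
    M, B_, N, K = 4, 3, 8, 16
    a = torch.randn(M, B_, K)
    w = torch.randn(B_, N, K)  # rcr: B stored as [B, N, K]
    c = torch.einsum("mbk,bnk->mbn", a, w)

    # The output TensorAccessor path lets this [M, B, N] result be written directly
    # into a slice of a larger concatenated buffer viewed as [M, B * N]; the kernel
    # only needs a bigger output_stride and a non-zero output_offset.
    big = torch.empty(M, 2 * B_ * N)  # hypothetical concat target
    big[:, : B_ * N] = c.reshape(M, -1)
    print(c.shape, big.shape)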
+ func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py new file mode 100644 index 000000000..e4a3d7d1b --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm102_bmm_rrr, which computes +C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row)) +""" +from ... import registry +from . import bmm_common, common +from .perm102_bmm_rcr import get_output_addr_calculator + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_default_problem_info(**kwargs): + problem_args = { + "bias_ptr": "c_ptr", + "a_batch_stride": "K", + "b_batch_stride": "N * K", + "bias_batch_stride": "N", + "c_batch_stride": "N", + "lda": "K * B", + "ldb": "N", + "ldbias": "N * B", + "ldc": "N * B", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +# Currently only has output Tensor Accessor support. 
+def _get_strided_problem_info(func_attrs): + return bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + a_ptr="(a_ptr)", + b_ptr="(b_ptr)", + bias_ptr="(c_ptr + output_offset)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="K", + b_batch_stride="N * K", + bias_batch_stride="output_batch_stride", + c_batch_stride="output_batch_stride", + lda="K * B", + ldb="N", + ldbias="output_stride", + ldc="output_stride", + ) + + +@registry.reg("cuda.perm102_bmm_rrr.config") +def gemm_rrr_config(func_attrs, dtype="float16"): + def fproc_f16(op): + import cutlass_lib + + return common.default_fproc_f16( + op=op, + a_layout=cutlass_lib.library.LayoutType.RowMajor, + b_layout=cutlass_lib.library.LayoutType.RowMajor, + c_layout=cutlass_lib.library.LayoutType.RowMajor, + epiligue_name=func_attrs["epilogue"], + ) + + func_attrs["op_instance"] = common.extract_config(fproc_f16) + + +@registry.reg("cuda.perm102_bmm_rrr.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["M", "B", "K"], b_dims=["B", "K", "N"], c_dims=["M", "B", "N"] + ) + + mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common.SRC_TEMPLATE, + problem_args, + args_parser, + ) + + +@registry.reg("cuda.perm102_bmm_rrr.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + bmm_problem_info = _get_strided_problem_info(func_attrs) + + # broadcasting is not supported + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + return bmm_common.gen_function( + func_attrs, + exec_cond_template, + problem_args, + dim_info_dict, + "", # input_addr_calculator + get_output_addr_calculator(func_attrs), + ) + + +@registry.reg("cuda.perm102_bmm_rrr.func_decl") +def gen_function_decl(func_attrs): + return bmm_common.gen_function_decl(func_attrs) + + +@registry.reg("cuda.perm102_bmm_rrr.func_call") +def gen_function_call(func_attrs, indent=" "): + return bmm_common.gen_function_call(func_attrs, indent) + + +@registry.reg("cuda.perm102_bmm_rrr.filter") +def function_filter(cfg, func_attrs, ab_alignment): + """Generates function filter. + + Parameters + ---------- + cfg: str + The filename generated for profiler. + func_attrs : Dict + Stores the operation attributes. + ab_alignment: + Input alignments. + + Returns + ------- + bool + If input cfg should be filtered. + """ + return common.function_filter(cfg, func_attrs, ab_alignment) diff --git a/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py new file mode 100644 index 000000000..f7435c071 --- /dev/null +++ b/python/aitemplate/backend/cuda/gemm_universal/perm102_bmm_rrr_bias.py @@ -0,0 +1,155 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Codegen functions for perm102_bmm_rrr_bias, which computes +C[m, b, n](row) = bmm(A[m, b, k](row), B[b, k, n](row)) + bias[n] +""" +from ... import registry +from . import bmm_common, common, common_bias, perm102_bmm_rrr +from .perm102_bmm_rcr import get_output_addr_calculator + +# pylint: disable=C0103,C0415,W0613,C0301,R1705,R1703 + + +def _get_default_problem_info(**kwargs): + problem_args = { + "beta_value": 1, + "bias_ptr": "bias_ptr", + "a_batch_stride": "K", + "b_batch_stride": "N * K", + "bias_batch_stride": "N", + "c_batch_stride": "N", + "lda": "K * B", + "ldb": "N", + "ldbias": "0", + "ldc": "N * B", + } + for k, v in kwargs.items(): + problem_args[k] = v + + bmm_problem_info = bmm_common.Bmm_problem_info(**problem_args) + return bmm_problem_info + + +# Currently only has output Tensor Accessor support. +def _get_strided_problem_info(func_attrs): + return bmm_common.Bmm_problem_info( + alpha_value=func_attrs.get("alpha", 1), + beta_value=1, + a_ptr="(a_ptr)", + b_ptr="(b_ptr)", + bias_ptr="(bias_ptr)", + c_ptr="(c_ptr + output_offset)", + a_batch_stride="K", + b_batch_stride="N * K", + bias_batch_stride="N", + c_batch_stride="output_batch_stride", + lda="K * B", + ldb="N", + ldbias="0", + ldc="output_stride", + ) + + +@registry.reg("cuda.perm102_bmm_rrr_bias.config") +def gemm_rrr_config(func_attrs, dtype="float16"): + return perm102_bmm_rrr.gemm_rrr_config(func_attrs, dtype) + + +@registry.reg("cuda.perm102_bmm_rrr_bias.gen_profiler") +def gen_profiler(func_attrs, workdir, dim_info_dict): + args_parser = bmm_common.ARGS_PARSER_TEMPLATE.render( + a_dims=["M", "B", "K"], b_dims=["B", "K", "N"], c_dims=["M", "B", "N"] + ) + + mm_info = _get_default_problem_info(alpha_value=func_attrs.get("alpha", 1)) + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=mm_info) + + bmm_common.gen_profiler( + func_attrs, + workdir, + dim_info_dict, + common_bias.SRC_TEMPLATE, + problem_args, + args_parser, + bias_ptr_arg="memory_pool->RequestHalfTensorByIdx(3)", + ) + + +@registry.reg("cuda.perm102_bmm_rrr_bias.gen_function") +def gen_function( + func_attrs, + exec_cond_template, + dim_info_dict, +): + bmm_problem_info = _get_strided_problem_info(func_attrs) + + # broadcasting is not supported + problem_args = bmm_common.PROBLEM_ARGS_TEMPLATE.render(mm_info=bmm_problem_info) + + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + output_ndims = len(func_attrs["output_accessors"][0].original_shapes) + + return common.gen_function( + func_attrs, + common_bias.SRC_TEMPLATE, + exec_cond_template, + problem_args, + input_ndims=input_ndims, + weight_ndims=weight_ndims, + output_ndims=output_ndims, + dim_info_dict=dim_info_dict, + output_addr_calculator=get_output_addr_calculator(func_attrs), + ) + + +@registry.reg("cuda.perm102_bmm_rrr_bias.func_decl") +def gen_function_decl(func_attrs): + func_name = func_attrs["name"] + input_ndims = len(func_attrs["input_accessors"][0].original_shapes) + weight_ndims = len(func_attrs["input_accessors"][1].original_shapes) + return common_bias.FUNC_DECL_TEMPLATE.render( + func_name=func_name, input_ndims=input_ndims, weight_ndims=weight_ndims + ) + + +@registry.reg("cuda.perm102_bmm_rrr_bias.func_call") +def gen_function_call(func_attrs, indent=" "): + bias = func_attrs["inputs"][2] + return bmm_common.gen_function_call( + func_attrs, indent, 
+        func_attrs, indent, bias_ptr_arg=bias._attrs["name"]
+    )
+
+
+@registry.reg("cuda.perm102_bmm_rrr_bias.filter")
+def function_filter(cfg, func_attrs, ab_alignment):
+    """Generates function filter.
+
+    Parameters
+    ----------
+    cfg: str
+        The filename generated for profiler.
+    func_attrs : Dict
+        Stores the operation attributes.
+    ab_alignment:
+        Input alignments.
+
+    Returns
+    -------
+    bool
+        If input cfg should be filtered.
+    """
+    return common.function_filter(cfg, func_attrs, ab_alignment)
diff --git a/python/aitemplate/backend/cuda/groupnorm/__init__.py b/python/aitemplate/backend/cuda/groupnorm/__init__.py
new file mode 100644
index 000000000..ee950628c
--- /dev/null
+++ b/python/aitemplate/backend/cuda/groupnorm/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from . import groupnorm, groupnorm_swish
+
+__all__ = ["groupnorm", "groupnorm_swish"]
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm.py
new file mode 100644
index 000000000..e26d8cd62
--- /dev/null
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm.py
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import Any, Dict
+
+from ... import registry
+
+from .groupnorm_common import (
+    groupnorm_gen_func_call,
+    groupnorm_gen_func_decl,
+    groupnorm_gen_function,
+)
+
+
+@registry.reg("cuda.groupnorm.gen_function")
+def gen_function(func_attrs: Dict[str, Any]) -> str:
+    return groupnorm_gen_function(func_attrs)
+
+
+@registry.reg("cuda.groupnorm.func_decl")
+def func_decl(func_attrs: Dict[str, Any]) -> str:
+    return groupnorm_gen_func_decl(func_attrs)
+
+
+@registry.reg("cuda.groupnorm.func_call")
+def gen_func_call(func_attrs: Dict[str, Any], indent=" ") -> str:
+    return groupnorm_gen_func_call(func_attrs, indent)
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
new file mode 100644
index 000000000..5b075783c
--- /dev/null
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_common.py
@@ -0,0 +1,179 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Common codegen functions for group_norm.
+"""
+
+import os
+from typing import Any, Dict, List
+
+import jinja2
+
+from ...target import Target
+
+FUNC_CALL_FP16_PARAM_TEMPLATE = jinja2.Template(
+    "reinterpret_cast<half*>(&({{name}}->raw()))"
+)
+
+FUNC_SIGNATURE = jinja2.Template(
+    """
+cudaError_t {{func_name}}(half* output,
+                          half* input,
+                          half* gamma,
+                          half* beta,
+                          int N,
+                          const float eps,
+                          const int max_smem_size,
+                          cudaStream_t stream)
+    """
+)
+
+FUNC_DECL = jinja2.Template(
+    """
+    {{func_signature}};
+    """
+)
+
+FUNC_CALL_TEMPLATE = jinja2.Template(
+    """
+{{indent}}{
+{{indent}} {{func_name}}(
+{{indent}} {{output}}, {{input}}, {{gamma}}, {{beta}}, {{N}},
+{{indent}} {{eps}}, max_smem_size, stream /* default stream */
+{{indent}} );
+{{indent}}}
+    """
+)
+
+
+FUNC_TEMPLATE = jinja2.Template(
+    """
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+#include <type_traits>
+#include "cutlass/arch/memory_sm80.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "logging.h"
+
+
+{{gamma_beta_const_defs}}
+
+namespace {
+
+{{custom_libs}}
+
+}  // namespace
+
+{{func_signature}}
+{
+  return invokeGroupNorm<{{FuseSwish}}, {{H}}, {{W}}, {{C}}, {{G}}>(
+      output,
+      input,
+      gamma,
+      beta,
+      N,
+      eps,
+      max_smem_size,
+      stream);
+}
+    """
+)
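+# Illustrative note: for a hypothetical op named "groupnorm_3" with an output
+# tensor "output_0" and a batch dimension "batch_0", FUNC_CALL_TEMPLATE above
+# renders roughly
+#   {
+#     groupnorm_3(
+#       reinterpret_cast<half*>(&(output_0->raw())), ..., batch_0,
+#       1e-05, max_smem_size, stream /* default stream */
+#     );
+#   }
+# where the pointer casts come from FUNC_CALL_FP16_PARAM_TEMPLATE.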
+
+
+def get_input_names(func_attrs: Dict[str, Any]) -> List[str]:
+    """
+    Return a list of rendered name strings for inputs. It returns nullptr
+    for gamma and beta if they are None.
+    """
+    inputs = func_attrs["inputs"]
+    x = inputs[0]
+    gamma = None
+    beta = None
+
+    idx = 1
+    if func_attrs["gamma_constant"] is None:
+        gamma = inputs[idx]
+        idx += 1
+    if func_attrs["beta_constant"] is None:
+        beta = inputs[idx]
+        idx += 1
+
+    input_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=x._attrs["name"])
+    if gamma is None:
+        gamma_name = "nullptr"
+    else:
+        gamma_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=gamma._attrs["name"])
+    if beta is None:
+        beta_name = "nullptr"
+    else:
+        beta_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(name=beta._attrs["name"])
+
+    return (input_name, gamma_name, beta_name)
+
+
+def groupnorm_gen_function(func_attrs: Dict[str, Any]) -> str:
+    use_swish = True if "swish" in func_attrs["name"] else False
+    input_shape = func_attrs["inputs"][0].shape()
+
+    H = input_shape[1].value()
+    W = input_shape[2].value()
+    C = input_shape[3].value()
+    G = func_attrs["num_groups"]
+
+    return FUNC_TEMPLATE.render(
+        custom_libs=Target.current().get_custom_libs(
+            os.path.dirname(__file__), "groupnorm_kernel.cuh"
+        ),
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]),
+        FuseSwish="true" if use_swish else "false",
+        H=H,
+        W=W,
+        C=C,
+        G=G,
+    )
+
+
+def groupnorm_gen_func_decl(func_attrs: Dict[str, Any]) -> str:
+    return FUNC_DECL.render(
+        func_signature=FUNC_SIGNATURE.render(func_name=func_attrs["name"]).strip()
+    )
+
+
+def groupnorm_gen_func_call(func_attrs: Dict[str, Any], indent=" ") -> str:
+    output_name = ""
+    assert len(func_attrs["outputs"]) == 1
+    assert 1 <= len(
+        func_attrs["inputs"]
+    ), "expected at least 1 inputs but got {}".format(len(func_attrs["inputs"]))
+
+    output_name = FUNC_CALL_FP16_PARAM_TEMPLATE.render(
+        name=func_attrs["outputs"][0]._attrs["name"]
+    )
+    (input_name, gamma_name, beta_name) = get_input_names(func_attrs)
+    input_shape = func_attrs["inputs"][0]._attrs["shape"]
+    eps = func_attrs["eps"]
+    return FUNC_CALL_TEMPLATE.render(
+        func_name=func_attrs["name"],
+        output=output_name,
+        input=input_name,
+        gamma=gamma_name,
+        beta=beta_name,
+        N=input_shape[0]._attrs["name"],
+        eps=eps,
+        indent=indent,
+    )
diff --git a/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
new file mode 100644
index 000000000..6a235589c
--- /dev/null
+++ b/python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh
@@ -0,0 +1,561 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef GROUPNORM_KERNEL_CUH
+#define GROUPNORM_KERNEL_CUH
+
+#define FINAL_MASK 0xffffffff
+
+#ifndef GROUP_NORM_CUDA_CHECK
+#define GROUP_NORM_CUDA_CHECK(expr)                                        \
+  do {                                                                     \
+    cudaError_t status = (expr);                                           \
+    if (status != cudaSuccess) {                                           \
+      std::cerr << "CUDA error: " << cudaGetErrorString(status) << " at "  \
+                << __FILE__ << ": " << __LINE__ << std::endl;              \
+      return status;                                                       \
+    }                                                                      \
+  } while (0)
+#endif
+
+#ifndef GROUP_NORM_CUDA_CHECK_LAUNCH
+#define GROUP_NORM_CUDA_CHECK_LAUNCH() GROUP_NORM_CUDA_CHECK(cudaGetLastError())
+#endif
+
+__inline__ __device__ float sigmoid(float val) {
+  return (cutlass::fast_tanh(val * 0.5f) + 1.0f) * 0.5f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The Groupnorm implementation below is based on OneFlow's Layernorm
+// implementation at:
+// https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/core/cuda/layer_norm.cuh
+
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#define __AIT_GN_USE_FAST_MATH 1
+template <typename T>
+__forceinline__ __device__ T Div(T a, T b);
+
+template <>
+__forceinline__ __device__ float Div<float>(float a, float b) {
+#ifdef __AIT_GN_USE_FAST_MATH
+  return __fdividef(a, b);
+#else
+  return a / b;
+#endif
+}
+
+template <>
+__forceinline__ __device__ half Div<half>(half a, half b) {
+  return __hdiv(a, b);
+}
+
+template <typename T>
+__forceinline__ __device__ T Rsqrt(T x);
+
+template <>
+__forceinline__ __device__ float Rsqrt<float>(float x) {
+#ifdef __AIT_GN_USE_FAST_MATH
+  return __frsqrt_rn(x);
+#else
+  return rsqrt(x);
+#endif
+}
+
+template <>
+__forceinline__ __device__ half Rsqrt<half>(half x) {
+  return hrsqrt(x);
+}
+
+#undef __AIT_GN_USE_FAST_MATH
+
+template <typename T>
+inline __device__ void WelfordCombine(T val, T* mean, T* m2, int* count) {
+  // Use Welford's online algorithm to compute mean and variance.
+  // For more details you can refer to:
+  // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+  *count += 1;
+  T delta1 = val - *mean;
+  *mean += Div(delta1, static_cast<T>(*count));
+  T delta2 = val - *mean;
+  *m2 += delta1 * delta2;
+}
+
+template <typename T>
+inline __device__ void WelfordCombine(
+    T b_mean,
+    T b_m2,
+    int b_count,
+    T* mean,
+    T* m2,
+    int* count) {
+  if (b_count == 0) {
+    return;
+  }
+  int new_count = *count + b_count;
+  T nb_over_n = Div((T)b_count, (T)new_count);
+  T delta = b_mean - *mean;
+  *mean += delta * nb_over_n;
+  *m2 += b_m2 + delta * delta * (T)(*count) * (T)(nb_over_n);
+  *count = new_count;
+}
+
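+// Illustrative note on the merge step above: combining two partial Welford
+// results (mean_a, m2_a, n_a) and (mean_b, m2_b, n_b) uses
+//   n     = n_a + n_b
+//   delta = mean_b - mean_a
+//   mean  = mean_a + delta * n_b / n
+//   m2    = m2_a + m2_b + delta^2 * n_a * n_b / n
+// which is what the warp and block reductions below accumulate pairwise with
+// __shfl_down_sync.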
+constexpr int kWarpSize = 32;
+
+template <typename T, int thread_group_width = kWarpSize>
+__inline__ __device__ void WelfordWarpReduce(
+    T thread_mean,
+    T thread_m2,
+    int thread_count,
+    T* mean,
+    T* m2,
+    int* count) {
+  *mean = thread_mean;
+  *m2 = thread_m2;
+  *count = thread_count;
+  for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    T b_mean = __shfl_down_sync(0xffffffff, *mean, mask, thread_group_width);
+    T b_m2 = __shfl_down_sync(0xffffffff, *m2, mask, thread_group_width);
+    int b_count =
+        __shfl_down_sync(0xffffffff, *count, mask, thread_group_width);
+    WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
+  }
+}
+
+template <typename T>
+__inline__ __device__ void WelfordBlockAllReduce(
+    T thread_mean,
+    T thread_m2,
+    int thread_count,
+    T* result_mean,
+    T* result_m2,
+    int* result_count) {
+  __shared__ T mean_shared[kWarpSize];
+  __shared__ T m2_shared[kWarpSize];
+  __shared__ int count_shared[kWarpSize];
+  __shared__ T mean_result_broadcast;
+  __shared__ T m2_result_broadcast;
+  __shared__ int count_result_broadcast;
+  const int lid = threadIdx.x % kWarpSize;
+  const int wid = threadIdx.x / kWarpSize;
+  T warp_mean = 0;
+  T warp_m2 = 0;
+  int warp_count = 0;
+  WelfordWarpReduce(
+      thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
+  __syncthreads();
+  if (lid == 0) {
+    mean_shared[wid] = warp_mean;
+    m2_shared[wid] = warp_m2;
+    count_shared[wid] = warp_count;
+  }
+  __syncthreads();
+  if (wid == 0) {
+    if (threadIdx.x < blockDim.x / kWarpSize) {
+      warp_mean = mean_shared[lid];
+      warp_m2 = m2_shared[lid];
+      warp_count = count_shared[lid];
+    } else {
+      warp_mean = static_cast<T>(0);
+      warp_m2 = static_cast<T>(0);
+      warp_count = static_cast<int>(0);
+    }
+    __syncwarp();
+    T block_mean = 0;
+    T block_m2 = 0;
+    int block_count = 0;
+    WelfordWarpReduce(
+        warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
+    if (lid == 0) {
+      mean_result_broadcast = block_mean;
+      m2_result_broadcast = block_m2;
+      count_result_broadcast = block_count;
+    }
+  }
+  __syncthreads();
+  *result_mean = mean_result_broadcast;
+  *result_m2 = m2_result_broadcast;
+  *result_count = count_result_broadcast;
+}
+
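+// Illustrative note on the kernel below: each thread block normalizes one
+// (batch element, group) slice, with blockIdx.x selecting the batch element
+// and blockIdx.y the group. T is a packed vector of sizeof(T) / sizeof(half)
+// half values, so the offsets and strides passed in are expressed in units
+// of T.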
+template <typename T, typename ComputeType, bool FuseSwish>
+__global__ void groupnorm_welford_fp16(
+    T* output,
+    T* input,
+    T* gamma,
+    T* beta,
+    const float eps,
+    const int64_t elems_per_block,
+    const int64_t elems_per_group_channel,
+    const int64_t batch_stride,
+    const int64_t group_stride,
+    const int64_t num_rows,
+    const int64_t row_stride) {
+  // all the numbers and strides are counted with respect to type T
+  constexpr int vec_size = sizeof(T) / sizeof(half);
+
+  const int tid = threadIdx.x;
+  const int bid = blockIdx.x;
+  const int gid = blockIdx.y; // index of group
+  const int64_t batch_offset = bid * batch_stride;
+  const int64_t group_offset = gid * group_stride;
+  const int64_t offset = batch_offset + group_offset;
+
+  // the first input of this thread
+  const T* t_input = input + offset;
+
+  ComputeType thread_mean = ComputeType(0.0);
+  ComputeType thread_m2 = ComputeType(0.0);
+  int thread_count = 0;
+#pragma unroll
+  for (int row_id = tid; row_id < num_rows; row_id += blockDim.x) {
+#pragma unroll
+    for (int i = 0; i < elems_per_group_channel; i++) {
+      const T* local_input = t_input + i + row_id * row_stride;
+      const half* half_ptr = reinterpret_cast<const half*>(local_input);
+#pragma unroll
+      for (int j = 0; j < vec_size; ++j) {
+        WelfordCombine(
+            __half2float(half_ptr[j]), &thread_mean, &thread_m2, &thread_count);
+      }
+    }
+  }
+  ComputeType row_mean = (ComputeType)(0.0f);
+  ComputeType row_m2 = (ComputeType)(0.0f);
+  int row_count = 0;
+  if (blockDim.x <= 32) {
+    WelfordWarpReduce(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+  } else {
+    WelfordBlockAllReduce(
+        thread_mean, thread_m2, thread_count, &row_mean, &row_m2, &row_count);
+  }
+  ComputeType row_variance = Div(row_m2, static_cast<ComputeType>(row_count));
+  ComputeType row_inv_var = Rsqrt(row_variance + static_cast<ComputeType>(eps));
+
+  float local_row_mean;
+  if (std::is_same<ComputeType, half>::value) {
+    local_row_mean = __half2float(row_mean);
+  } else if (std::is_same<ComputeType, float>::value) {
+    local_row_mean = row_mean;
+  }
+  float local_row_inv_var;
+  if (std::is_same<ComputeType, half>::value) {
+    local_row_inv_var = __half2float(row_inv_var);
+  } else if (std::is_same<ComputeType, float>::value) {
+    local_row_inv_var = row_inv_var;
+  }
+
+  const T* t_gamma = gamma + group_offset;
+  const T* t_beta = beta + group_offset;
+  // the first output of this thread
+  T* t_output = output + offset;
+#pragma unroll
+  for (int row_id = tid; row_id < num_rows; row_id += blockDim.x) {
+#pragma unroll
+    for (int i = 0; i < elems_per_group_channel; i++) {
+      const T* local_input = t_input + i + row_id * row_stride;
+      const half* input_half_ptr = reinterpret_cast<const half*>(local_input);
+
+      T* local_output = t_output + i + row_id * row_stride;
+      T tmp_output;
+      half* output_half_ptr = reinterpret_cast<half*>(&tmp_output);
+
+      const T* local_gamma = t_gamma + i;
+      const T* local_beta = t_beta + i;
+      const half* gamma_half_ptr = reinterpret_cast<const half*>(local_gamma);
+      const half* beta_half_ptr = reinterpret_cast<const half*>(local_beta);
+
+#pragma unroll
+      for (int j = 0; j < vec_size; ++j) {
+        float local_val = __half2float(input_half_ptr[j]);
+        float local_gamma = __half2float(gamma_half_ptr[j]);
+        float local_beta = __half2float(beta_half_ptr[j]);
+        float out_val = (local_val - local_row_mean) * local_row_inv_var;
+        out_val = out_val * local_gamma + local_beta;
+        out_val = FuseSwish ? out_val * sigmoid(out_val) : out_val;
+        output_half_ptr[j] = __float2half_rn(out_val);
+      }
+      *local_output = tmp_output;
+    }
+  }
+}
+
+// End of the Groupnorm implementation that is based on OneFlow's Layernorm
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template