[Doc] Customization Optimization
Hzfengsy committed Sep 1, 2024
1 parent 98de9ba commit edf1d5e
Showing 2 changed files with 226 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/how_to/index.rst
@@ -19,5 +19,6 @@
   :maxdepth: 1

   tutorials/e2e_opt_model
   tutorials/customize_opt
   tutorials/cross_compilation_and_rpc
   dev/index
225 changes: 225 additions & 0 deletions docs/how_to/tutorials/customize_opt.py
@@ -0,0 +1,225 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
.. _customize_opt:

Customize Optimization
======================
One main design goal of Apache TVM is to enable easy customization of the optimization
pipeline for both research and development purposes, and to iterate on engineering
optimizations. In this tutorial, we will demonstrate how to customize the optimization
pipeline with Apache TVM.

.. contents:: Table of Contents
   :local:
   :depth: 1
"""

######################################################################
# Review Overall Flow
# -------------------
# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
#    :align: center
#    :width: 80%
#
# The overall flow consists of the following steps:
#
# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
#   model from other frameworks (e.g., PyTorch, ONNX), and create the TVM IRModule, which
#   contains all the information needed for compilation, including high-level Relax functions
#   for the computational graph and low-level TensorIR functions for the tensor programs.
# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
#   such as graph optimizations, tensor program optimizations, and library dispatching.
# - **Build and Universal Deployment**: Build the optimized model into a deployable module for
#   the universal runtime, and execute it on different devices, such as CPU, GPU, or other
#   accelerators.
#

import os
import tempfile
import numpy as np
import tvm
from tvm import IRModule, relax
from tvm.relax.frontend import nn

######################################################################
# Composable IRModule Optimization
# --------------------------------
# Apache TVM Unity provides a flexible way to optimize the IRModule. All transformations
# centered around the IRModule can be composed with existing pipelines. Note that each
# transformation can focus on **part of the computation graph**, enabling partial lowering
# or partial optimization.
#
# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.

######################################################################
# Prepare a Relax Module
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# We first prepare a Relax module. The module can be imported from other frameworks,
# constructed with the NN module frontend, or written in TVMScript. Here we use a simple
# neural network model as an example.


class RelaxModel(nn.Module):
    def __init__(self):
        super(RelaxModel, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 10, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x


input_shape = (1, 784)
mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}})
mod.show()
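
######################################################################
# As mentioned above, the same module could also be written by hand in TVMScript instead of
# being exported from the NN module frontend. The following is an illustrative sketch of an
# equivalent Relax function; the exported module above may differ in details such as parameter
# packing and function attributes.

from tvm.script import ir as I
from tvm.script import relax as R


@I.ir_module
class TVMScriptModule:
    @R.function
    def forward(
        x: R.Tensor((1, 784), "float32"),
        fc1_weight: R.Tensor((256, 784), "float32"),
        fc1_bias: R.Tensor((256,), "float32"),
        fc2_weight: R.Tensor((10, 256), "float32"),
    ) -> R.Tensor((1, 10), "float32"):
        with R.dataflow():
            w1_t = R.permute_dims(fc1_weight)  # transpose to (784, 256)
            lv0 = R.matmul(x, w1_t)
            lv1 = R.add(lv0, fc1_bias)
            lv2 = R.nn.relu(lv1)
            w2_t = R.permute_dims(fc2_weight)  # transpose to (256, 10)
            out = R.matmul(lv2, w2_t)
            R.output(out)
        return out


TVMScriptModule.show()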

######################################################################
# Library Dispatch
# ~~~~~~~~~~~~~~~~
# We often want to quickly try out a library-based optimization for a certain platform
# (e.g., GPU). We can write a dispatching pass for the specific platform and operator.
# Here we demonstrate how to dispatch certain patterns to the cuBLAS library.
#
# .. note::
#    This tutorial only demonstrates a single pattern dispatched to cuBLAS, highlighting
#    the flexibility of the optimization pipeline. In real-world cases, we can import
#    multiple patterns and dispatch them to different kernels.


# Import the cuBLAS backend; importing the module registers its fusion patterns
import tvm.relax.backend.contrib.cublas as _cublas


# Define a new pass for cuBLAS dispatch
@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
class CublasDispatch:
    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
        # Check if cuBLAS is enabled in this TVM build
        if not tvm.get_global_func("relax.ext.cublas", True):
            raise Exception("CUBLAS is not enabled.")

        # Get the patterns we are interested in
        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
        # Note: in real-world cases, we usually retrieve all supported patterns:
        # patterns = relax.backend.get_patterns_with_prefix("cublas")

        # Fuse ops by patterns, then run codegen to offload the fused functions
        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
        mod = relax.transform.RunCodegen()(mod)
        return mod


mod = CublasDispatch()(mod)
mod.show()

######################################################################
# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are fused
# and rewritten into a ``call_dps_packed`` function, which calls the cuBLAS library. Notably,
# the other parts of the module are not changed, which means we can selectively dispatch
# optimizations for certain computations.
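
######################################################################
# To see exactly what the pass produced, we can list the functions now present in the
# module. This is a minimal inspection sketch: the exact names of the fused functions
# depend on the pattern matcher, so treat the printed names as illustrative.

for gvar, func in mod.functions.items():
    print(gvar.name_hint, "-", type(func).__name__)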

######################################################################
# Auto Tuning
# ~~~~~~~~~~~
# Continuing from the previous example, we can further optimize the model with auto-tuning for
# the **remaining part of the computation**. Here we demonstrate how to use MetaSchedule to
# auto-tune the model.
#
# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
# ``MetaScheduleApplyDatabase`` pass to apply the best configurations to the model. The tuning
# step generates the search space and tunes the model, and the following step applies the best
# configurations found in the database. Before running the passes, we need to lower the Relax
# operators into TensorIR functions via ``LegalizeOps``.
#
# .. note::
#
#    To save CI time and avoid flakiness, we skip the tuning process in the CI environment.
#

device = tvm.cuda(0)
target = tvm.target.Target.from_device(device)
if os.getenv("CI", "") != "true":
    trials = 2000
    with target, tempfile.TemporaryDirectory() as tmp_dir:
        mod = tvm.ir.transform.Sequential(
            [
                relax.get_pipeline("zero"),
                relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
                relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
            ]
        )(mod)

mod.show()

######################################################################
# DLight Rules
# ~~~~~~~~~~~~
# DLight rules are a set of default rules for scheduling and optimizing kernels.
# They are designed for fast compilation and **fair** performance. In some cases,
# e.g., language models, DLight provides excellent performance, while for generic models
# it achieves a balance between performance and compilation time.

from tvm import dlight as dl

# Apply DLight rules
with target:
    mod = tvm.ir.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            dl.ApplyDefaultSchedule(  # pylint: disable=not-callable
                dl.gpu.Matmul(),
                dl.gpu.GEMV(),
                dl.gpu.Reduction(),
                dl.gpu.GeneralReduction(),
                dl.gpu.Fallback(),
            ),
        ]
    )(mod)

mod.show()

######################################################################
# .. note::
#
#    This tutorial focuses on demonstrating the optimization pipeline, rather than
#    pushing performance to its limit. The current optimization may not be the best.


######################################################################
# Deploy the Optimized Model
# --------------------------
# We can build and deploy the optimized model to the TVM runtime.

ex = relax.build(mod, target="cuda")
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
gpu_out = vm["forward"](data, *gpu_params).numpy()
print(gpu_out)
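
######################################################################
# As a quick sanity check, we can reproduce the forward pass in NumPy and compare it against
# the TVM output. This is an illustrative sketch, assuming the exported parameter order is
# (fc1.weight, fc1.bias, fc2.weight) and that the frontend's ``nn.Linear`` follows the
# ``x @ W.T + b`` convention, matching the ``RelaxModel`` definition above.

# Pull the randomly initialized parameters back to the host and replay fc1 -> ReLU -> fc2
w1, b1, w2 = [p.numpy() for p in gpu_params]
ref = np.maximum(data.numpy() @ w1.T + b1, 0.0) @ w2.T
np.testing.assert_allclose(gpu_out, ref, rtol=1e-3, atol=1e-3)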


######################################################################
# Summary
# -------
# This tutorial demonstrates how to customize the optimization pipeline for ML models in Apache TVM.
# We can easily compose the optimization passes and customize the optimization for different parts
# of the computation graph. The flexibility of the optimization pipeline enables us to quickly
# iterate on the optimizations and improve the performance of the model.
#
