Showing 2 changed files with 226 additions and 0 deletions.
@@ -19,5 +19,6 @@
    :maxdepth: 1

    tutorials/e2e_opt_model
+   tutorials/customize_opt
    tutorials/cross_compilation_and_rpc
    dev/index
@@ -0,0 +1,225 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

""" | ||
.. _customize_opt: | ||
Customize Optimization | ||
====================== | ||
One main design goal of Apache TVM is to enable easy customization of the optimization pipeline | ||
for both research or development purposes and iterate the engineering optimizations. In this | ||
tutorial we will | ||
.. contents:: Table of Contents | ||
:local: | ||
:depth: 1 | ||
""" | ||
|
||
######################################################################
# Review Overall Flow
# -------------------
# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
#    :align: center
#    :width: 80%
#
# The overall flow consists of the following steps:
#
# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
#   model from other frameworks (e.g., PyTorch, ONNX), and create the TVM IRModule, which contains
#   all the information needed for compilation, including high-level Relax functions for the
#   computational graph and low-level TensorIR functions for tensor programs.
# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
#   such as graph optimizations, tensor program optimizations, and library dispatching.
# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.
#

import os
import tempfile
import numpy as np
import tvm
from tvm import IRModule, relax
from tvm.relax.frontend import nn

######################################################################
# Composable IRModule Optimization
# --------------------------------
# Apache TVM Unity provides a flexible way to optimize the IRModule. Since everything is
# centered around the IRModule, optimizations can be composed with existing pipelines. Note
# that each optimization can focus on **part of the computation graph**, enabling partial
# lowering or partial optimization.
#
# In this tutorial, we will demonstrate how to optimize a model with Apache TVM Unity.

######################################################################
# Prepare a Relax Module
# ~~~~~~~~~~~~~~~~~~~~~~
# We first prepare a Relax module. The module can be imported from other frameworks,
# constructed with the NN module frontend, or written directly in TVMScript. Here we use
# a simple neural network model as an example.


class RelaxModel(nn.Module):
    def __init__(self):
        super(RelaxModel, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 10, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x


input_shape = (1, 784)
mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}})
mod.show()
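
######################################################################
# As an illustrative aside (not part of the original tutorial flow), the same
# network could also be written directly in TVMScript. The parameter names and
# the transposed weight layout below are assumptions chosen to mirror what
# ``nn.Linear`` exports; this is a sketch, not the module exported above.

from tvm.script import ir as I
from tvm.script import relax as R


@I.ir_module
class TVMScriptModule:
    @R.function
    def forward(
        x: R.Tensor((1, 784), "float32"),
        fc1_weight: R.Tensor((256, 784), "float32"),
        fc1_bias: R.Tensor((256,), "float32"),
        fc2_weight: R.Tensor((10, 256), "float32"),
    ) -> R.Tensor((1, 10), "float32"):
        with R.dataflow():
            # fc1: x @ W1^T + b1, followed by ReLU
            lv0 = R.matmul(x, R.permute_dims(fc1_weight))
            lv1 = R.nn.relu(R.add(lv0, fc1_bias))
            # fc2 (no bias): hidden @ W2^T
            out = R.matmul(lv1, R.permute_dims(fc2_weight))
            R.output(out)
        return out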

######################################################################
# Library Dispatch
# ~~~~~~~~~~~~~~~~
# We often want to quickly try out a library-based optimization for a certain platform
# (e.g., GPU). We can write a dedicated dispatching pass for the specific platform and
# operator. Here we demonstrate how to dispatch certain patterns to the CUBLAS library.
#
# .. note::
#   This tutorial only demonstrates a single operator dispatching for CUBLAS, highlighting
#   the flexibility of the optimization pipeline. In real-world cases, we can import multiple
#   patterns and dispatch them to different kernels.


# Import cublas pattern
import tvm.relax.backend.contrib.cublas as _cublas


# Define a new pass for CUBLAS dispatch
@tvm.transform.module_pass(opt_level=0, name="CublasDispatch")
class CublasDispatch:
    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
        # Check if CUBLAS is enabled
        if not tvm.get_global_func("relax.ext.cublas", True):
            raise Exception("CUBLAS is not enabled.")

        # Get interested patterns
        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
        # Note in real-world cases, we usually get all patterns
        # patterns = relax.backend.get_patterns_with_prefix("cublas")

        # Fuse ops by patterns and then run codegen
        mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod)
        mod = relax.transform.RunCodegen()(mod)
        return mod


mod = CublasDispatch()(mod)
mod.show()

######################################################################
# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are fused
# and rewritten into a ``call_dps_packed`` function that calls the CUBLAS library. Notably, the
# rest of the graph is left unchanged, which means we can selectively dispatch optimizations for
# certain computations.
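
# (Illustrative) One quick way to sanity-check the dispatch is to list the
# global functions now present in the module; after ``RunCodegen`` the fused
# pattern is replaced by a call into the external CUBLAS kernel.
print([gv.name_hint for gv in mod.get_global_vars()])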

######################################################################
# Auto Tuning
# ~~~~~~~~~~~
# Continuing from the previous example, we can further optimize the model with auto-tuning for
# the **remaining part of the computation**. Here we demonstrate how to use MetaSchedule to
# auto-tune the model.
#
# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
# ``MetaScheduleApplyDatabase`` pass to apply the best configuration to the model. The tuning
# process generates a search space and tunes the model, and the following step applies the best
# configuration found. Before running the passes, we need to lower Relax operators into TensorIR
# functions via ``LegalizeOps``.
#
# .. note::
#
#   To save CI time and avoid flakiness, we skip the tuning process in the CI environment.
#

device = tvm.cuda(0)
target = tvm.target.Target.from_device(device)
if os.getenv("CI", "") != "true":
    trials = 2000
    with target, tempfile.TemporaryDirectory() as tmp_dir:
        mod = tvm.ir.transform.Sequential(
            [
                relax.get_pipeline("zero"),
                relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials),
                relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir),
            ]
        )(mod)

    mod.show()

######################################################################
# DLight Rules
# ~~~~~~~~~~~~
# DLight rules are a set of default rules for scheduling and optimizing kernels.
# DLight rules are designed for fast compilation and **fair** performance. In some cases,
# e.g., language models, DLight provides excellent performance, while for generic models
# it achieves a balance between performance and compilation time.

from tvm import dlight as dl

# Apply DLight rules
with target:
    mod = tvm.ir.transform.Sequential(
        [
            relax.get_pipeline("zero"),
            dl.ApplyDefaultSchedule(  # pylint: disable=not-callable
                dl.gpu.Matmul(),
                dl.gpu.GEMV(),
                dl.gpu.Reduction(),
                dl.gpu.GeneralReduction(),
                dl.gpu.Fallback(),
            ),
        ]
    )(mod)

mod.show()

######################################################################
# .. note::
#
#   This tutorial focuses on demonstrating the optimization pipeline, rather than
#   pushing performance to its limit. The current optimization may not be the best.


######################################################################
# Deploy the Optimized Model
# --------------------------
# We can build and deploy the optimized model to the TVM runtime.

ex = relax.build(mod, target="cuda")
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
data = tvm.nd.array(np.random.rand(*input_shape).astype("float32"), dev)
gpu_params = [tvm.nd.array(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params]
gpu_out = vm["forward"](data, *gpu_params).numpy()
print(gpu_out)
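
# (Optional, illustrative) We can also benchmark the compiled function. This
# sketch assumes the Relax VM exposes ``time_evaluator`` in the same way as
# other TVM runtime modules.
timer = vm.time_evaluator("forward", dev, number=10)
print(timer(data, *gpu_params))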


######################################################################
# Summary
# -------
# This tutorial demonstrates how to customize the optimization pipeline for ML models in Apache
# TVM. We can easily compose optimization passes and customize the optimization for different
# parts of the computation graph. The flexibility of the optimization pipeline enables us to
# quickly iterate on optimizations and improve model performance.
#