diff --git a/python/tvm/contrib/cutlass/conv2d_profiler.py b/python/tvm/contrib/cutlass/conv2d_profiler.py new file mode 100644 index 0000000000000..2e4ef4f056afb --- /dev/null +++ b/python/tvm/contrib/cutlass/conv2d_profiler.py @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-outside-toplevel, invalid-name +"""Instantiate a C++ source for profiling CUTLASS kernels.""" + + +class Conv2dProfilerEmitter(object): + """Emit a C++ source for profiling CUTLASS kernels.""" + + def __init__(self): + from jinja2 import Template + + self.template = Template( + """ +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "helper.h" + +{{OperatorDef}} +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<{{OperatorName}}>; + +struct Options { + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + int pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w; + cmd.get_cmd_line_argument("pad_h", pad_h); + cmd.get_cmd_line_argument("pad_w", pad_w); + cmd.get_cmd_line_argument("stride_h", stride_h); + cmd.get_cmd_line_argument("stride_w", stride_w); + cmd.get_cmd_line_argument("dilation_h", dilation_h); + cmd.get_cmd_line_argument("dilation_w", dilation_w); + filter_size.c() = input_size.c(); + padding = {pad_h, pad_h, pad_w, pad_w}; + conv_stride = {stride_h, stride_w}; + dilation = {dilation_h, dilation_w}; + } + + cutlass::Tensor4DCoord output_size() const { + auto dilated_h = (filter_size.h() - 1) * dilation.row() + 1; + auto dilated_w = (filter_size.w() - 1) * dilation.column() + 1; + auto h = (input_size.h() + padding.n() + padding.h() - dilated_h) / conv_stride.row() + 1; + auto w = (input_size.w() + padding.w() + padding.c() - dilated_w) / conv_stride.column() + 1; + return cutlass::Tensor4DCoord( + input_size.n(), + h, w, + filter_size.n()); + } +}; + +double profile_convolution(Options const &options) { + using ElementOutput = typename ImplicitGemm::ElementC; + using ElementInputA = typename ImplicitGemm::ElementA; + using ElementInputB = typename ImplicitGemm::ElementB; + auto oshape = options.output_size(); + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_c(oshape); + cutlass::HostTensor tensor_ref_c(oshape); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(7), + ElementInputA(-8), + 0); + + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + cutlass::reference::host::TensorFill( + tensor_c.host_view()); + + cutlass::reference::host::TensorFill( + tensor_ref_c.host_view()); + + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_ref_c.sync_device(); + + cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + cutlass::conv::Mode::kCrossCorrelation, + 1 + ); + + using ElementComputeEpilogue = typename ImplicitGemm::ElementCompute; + typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {ElementComputeEpilogue(1), ElementComputeEpilogue(0)}, + }; + + ImplicitGemm implicit_gemm_op; + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + auto status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + status = implicit_gemm_op(); + CUTLASS_CHECK(status); + + cudaEvent_t events[2]; + for (auto & event : events) { + cudaEventCreate(&event); + } + cudaEventRecord(events[0]); + + for (int iteration = 0; iteration < 100; ++iteration) { + auto status = implicit_gemm_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(events[1]); + cudaEventSynchronize(events[1]); + float runtime_ms = 0; + cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + + for (auto event : events) { + (void)cudaEventDestroy(event); + } + return double(runtime_ms) / 100.0; +} + +int main(int argc, char const **args) { + Options options; + options.parse(argc, args); + std::cout << profile_convolution(options) << std::endl; + return 0; +} +""" + ) + + def emit(self, op_def, op_name): + src = self.template.render(OperatorDef=op_def, OperatorName=op_name) + return src diff --git a/python/tvm/contrib/cutlass/gen_conv2d.py b/python/tvm/contrib/cutlass/gen_conv2d.py index 5a616c9b6e024..d89efa182fc4c 100644 --- a/python/tvm/contrib/cutlass/gen_conv2d.py +++ b/python/tvm/contrib/cutlass/gen_conv2d.py @@ -18,6 +18,12 @@ """Conv2d kernel generator and profiler for CUTLASS.""" from .conv2d_operation import Conv2dOperation, EmitConv2dInstance from .gen_gemm import CutlassGemmProfiler +from .conv2d_profiler import Conv2dProfilerEmitter +from .gen_tensor_op import ( + ProfilerEngine, + generate_sm75_tensor_op_1688, + generate_sm80_tensor_op_16816, +) from .library import ( EpilogueFunctor, SwizzlingFunctor, @@ -39,6 +45,7 @@ def create_conv2d_operator( ret = [] kernel_emitter = EmitConv2dInstance() + profiler_emitter = Conv2dProfilerEmitter() element_a, element_b, element_c, element_epilogue = data_type iterator_algorithms = [IteratorAlgorithm.Optimized] @@ -75,6 +82,7 @@ def create_conv2d_operator( # TODO(masahi): Add profiler source here op_entry["opdef"] = kernel_emitter.emit(op) op_entry["op"] = op + op_entry["src"] = profiler_emitter.emit(op_entry["opdef"], op.procedural_name()) op_entry["name"] = op.procedural_name() op_entry["runtime"] = 9999999 @@ -144,4 +152,6 @@ def profile( alignment = gemm_profile_result["alignment"] data_type = gemm_profile_result["data_type"] - return create_conv2d_operator([tile_description], data_type, [alignment])[0] + out = create_conv2d_operator([tile_description], data_type, [alignment])[0] + print(out["src"]) + return out