From 2cf8989d1e8257299275dfea42f992de5e38b434 Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Tue, 14 Jun 2022 09:40:10 -0700 Subject: [PATCH] [hexagon][testing] add max_pool2d benchmark - Add benchmarking framework for Hexagon maxpool-2d kernels, and one (simple) kernel. - Minor refactor hexagon benchmark utilities. --- .../test_hexagon/benchmark_elemwise_add.py | 18 +- .../test_hexagon/benchmark_maxpool2d.py | 379 ++++++++++++++++++ .../contrib/test_hexagon/benchmark_util.py | 14 + 3 files changed, 402 insertions(+), 9 deletions(-) create mode 100644 tests/python/contrib/test_hexagon/benchmark_maxpool2d.py diff --git a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py index 70266d7939bc5..bd33a4f1a0404 100644 --- a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py +++ b/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py @@ -28,7 +28,7 @@ from tvm.script import tir as T from tvm import te from tvm.contrib.hexagon.build import HexagonLauncherRPC -from . import benchmark_util +from . import benchmark_util as bu # This is a fixed detail of the v68 architecture. HVX_VECTOR_BYTES = 128 @@ -43,7 +43,7 @@ # triggering TIME_WAIT state on the server socket. This prevents another # server to bind to the same port until the wait time elapses. -_BT = benchmark_util.BenchmarksTable() +_BT = bu.BenchmarksTable() _CSV_COLUMN_ORDER = [ # Identifies which TE-compute / TIRScript is used as the basis for the @@ -129,7 +129,7 @@ def _get_irmod_elemwise_add( dtype_str = str(dtype) if mem_scope == "global.vtcm": - raise UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") + raise bu.UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") # This check is currently elided by the one above, but it should become relevant as soon # as we add VTCM support to this kernel generator. @@ -147,7 +147,7 @@ def _get_irmod_elemwise_add( estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes: - raise UnsupportedException("Expect to exceed VTCM budget.") + raise bu.UnsupportedException("Expect to exceed VTCM budget.") @tvm.script.ir_module class BenchmarkModule: @@ -190,10 +190,10 @@ def _benchmark_hexagon_elementwise_add_kernel( "mem_scope": mem_scope, } - desc = benchmark_util.get_benchmark_decription(keys_dict) + desc = bu.get_benchmark_decription(keys_dict) # Create the host-side directory for this benchmark run's files / logs... - host_files_dir_name = benchmark_util.get_benchmark_id(keys_dict) + host_files_dir_name = bu.get_benchmark_id(keys_dict) host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name) os.mkdir(host_files_dir_path) @@ -296,11 +296,11 @@ def _benchmark_hexagon_elementwise_add_kernel( result, host_numpy_C_data_expected, rel_tolerance, abs_tolerance ) except AssertionError as e: - raise NumericalAccuracyException(str(e)) + raise bu.NumericalAccuracyException(str(e)) _BT.record_success(timing_result, **keys_dict) - except NumericalAccuracyException as e: + except bu.NumericalAccuracyException as e: print() print(f"FAIL: Numerical accuracy error. See log file.") @@ -309,7 +309,7 @@ def _benchmark_hexagon_elementwise_add_kernel( _BT.record_fail(**keys_dict, comments=f"Numerical accuracy error. See log file.") - except UnsupportedException as e: + except bu.UnsupportedException as e: print() print(f"SKIP: {e}") diff --git a/tests/python/contrib/test_hexagon/benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/benchmark_maxpool2d.py new file mode 100644 index 0000000000000..68c74e5db6d5c --- /dev/null +++ b/tests/python/contrib/test_hexagon/benchmark_maxpool2d.py @@ -0,0 +1,379 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +This module serves two purposes: + (1) Demonstrates how to write Python code that exercises various + Hexagon-related algorithms / features. + + (2) Benchmark the resulting primfuncs. + +Current limitations: + - Input shapes are limited to NHWC --> NHWC_8h8w32c. + + - Testing parameters (input shapes, dtypes, etc.) currently + support only one value for each parameter. + + - H, W, C must be integer multiples of 8, 8, and 32, + respectively. I.e., partial blocks aren't currently + supported by this script. + + - Requires that I/O tensors reside in "global.VTCM" memory, + rather than "global" memory. + This prevents benchmarking with I/O tensors that are too + large to fit into availble VTCM. + + - The script only develops one primfunc. + Future revisions to this script are expected to add more + primfuncs and demonstrate more coding strategies. +""" + +import sys +import pytest +import numpy as np +import copy +import tempfile +import os + +import tvm.testing +from tvm import te, topi, tir +from tvm.topi import testing +from tvm.script import tir as T +from tvm.tir import IndexMap +from tvm.relay.backend import Executor, Runtime +from tvm.contrib.hexagon.session import Session +from typing import List + +from .infrastructure import allocate_hexagon_array +from . import benchmark_util as bu + + +def _ceil_div(numerator, denominator): + return (numerator + (denominator - 1)) // denominator + + +def _int8_nhwc_8h8w32c_map(n, h, w, c): + return [ + n, + h // 8, + w // 8, + c // 32, + te.AXIS_SEPARATOR, + h % 8, + w % 8, + c % 32, + ] + + +def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]: + return [ + n, + _ceil_div(h, 8), + _ceil_div(w, 8), + _ceil_div(c, 32), + 8, + 8, + 32, + ] + + +def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray: + """ + Return a deep copy of 'arr_in', transformed from a NWHC to + NHWC-8h8wc32 shape. Any newly created array elements have value 0. + """ + stage1 = copy.copy(arr_in) + + ( + n, + h, + w, + c, + ) = stage1.shape + + ( + h_minor, + w_minor, + c_minor, + ) = [8, 8, 32] + + h_major = _ceil_div(h, h_minor) + w_major = _ceil_div(w, w_minor) + c_major = _ceil_div(c, c_minor) + + # This handles cases where the dimensions of arr_in are not cleanly divided + # by the minor block size, i.e. [8, 8, 32]. + # + # Any additional array elements that this creates will ahve value 0. + # We shouldn't actually care what value is used for those elements, because they + # shouldn't be treated as meaningful by any of our algorithms. + if (h % h_minor) or (w % w_minor) or (c % c_minor): + stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False) + + stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor) + stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6) + return stage3 + + +def _create_test_input(shape, dtype: str) -> np.ndarray: + np_dtype = np.dtype(dtype) + min_value = np.iinfo(np_dtype).min + max_value = np.iinfo(np_dtype).max + return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np.int8) + + +# This fixture provides some initialization / finalization logic for groups of related +# benchmark runs. +# See the `TestMaxPool2D` class for its intended usage. +@pytest.fixture(scope="class") +def benchmark_group(request): + working_dir = tempfile.mkdtemp() + bt = bu.BenchmarksTable() + + request.cls.working_dir = working_dir + request.cls.benchmark_table = bt + + yield + + tabular_output_filename = os.path.join(working_dir, "benchmark-results.csv") + + if not hasattr(request.cls, "csv_column_order"): + raise Exception('Classes using this fixture must have a member named "csv_column_order"') + + with open(tabular_output_filename, "w") as csv_file: + bt.print_csv(csv_file, request.cls.csv_column_order) + + print() + print("*" * 80) + print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}") + print("*" * 80) + print() + + if bt.has_fail() > 0: + pytest.fail("At least one benchmark configuration failed", pytrace=False) + + +@pytest.mark.usefixtures("benchmark_group") +class TestMaxPool2D: + csv_column_order = [ + # Identifies which TE-compute / TIRScript is used as the basis for the + # benchmarked primfunc. Only needs to be meaningful to humans. + "basic_kernel", + # When applicable, indicates the particular variation of schedules + # apply by the Python code. Decoding this may require looking at this + # script's source code. + "sched_type", + # Values directly based on test parameters... + "input_shape_4d", + "block_shape", + "DTYPE", + "KERNEL", + "STRIDE", + "DILATION", + "PADDING", + "IO_TENSOR_MEM_SCOPE", + # Reserved columns defined by the BenchmarksTable class. + "row_status", + "timings_min_usecs", + "timings_max_usecs", + "timings_median_usecs", + "timings_mean_usecs", + "timings_stddev_usecs", + # For benchmarks that produce files on the host file system, this indicates + # their location. Useful for post-mortem investigation of benchmark results. + "host_files_dir_path", + # Miscellaneous comments about the benchmark. + "comments", + ] + + DTYPE = tvm.testing.parameter("int8") + + # FIXME(cconvey): The script currently fails when H, W, or C is not an + # integer multiple of 8, 8, or 32, respectively. + N = tvm.testing.parameter(1) + H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]]) + C = tvm.testing.parameter(*[x * 32 for x in [1, 2]]) + + KERNEL = tvm.testing.parameter((1, 1), (3, 3)) + STRIDE = tvm.testing.parameter((1, 1)) + DILATION = tvm.testing.parameter((1, 1)) + PADDING = tvm.testing.parameter((0, 0, 0, 0)) + IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm") + + @tvm.testing.requires_hexagon + def test_maxpool2d_nhwc( + self, + N, + H, + W, + C, + DTYPE, + KERNEL, + STRIDE, + DILATION, + PADDING, + IO_TENSOR_MEM_SCOPE, + hexagon_session: Session, + ): + keys_dict = { + "basic_kernel": "max_pool2d", + "sched_type": 1, + "input_shape_4d": [N, H, W, C], + "block_shape": [8, 8, 32], + "DTYPE": DTYPE, + "KERNEL": KERNEL, + "STRIDE": STRIDE, + "DILATION": DILATION, + "PADDING": PADDING, + "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE, + } + + desc = bu.get_benchmark_decription(keys_dict) + + # Create the host-side directory for this benchmark run's files / logs... + host_files_dir_name = bu.get_benchmark_id(keys_dict) + host_files_dir_path = os.path.join(self.working_dir, host_files_dir_name) + os.mkdir(host_files_dir_path) + + keys_dict["host_files_dir_path"] = host_files_dir_path + + log_file_path = os.path.join(host_files_dir_path, "out.txt") + with open(log_file_path, "w") as log_file: + print(f"CONFIGURATION: {desc}") + log_file.write(f"CONFIGURATION: {desc}\n") + + try: + input_tensor_shape_4d = [N, H, W, C] + input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C) + + data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE) + + output = topi.nn.pool2d( + data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC" + ) + primfunc = te.create_prim_func([data, output]) + + sch = tir.Schedule(primfunc, debug_mask="all") + + sch.transform_layout( + block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map + ) + + target_hexagon = tvm.target.hexagon("v69", link_params=True) + # func = tvm.build(sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)) + built_module = tvm.build( + sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + + # Save a local copy of the Hexagon object code (in the form of a .so file) + # to allow post-mortem inspection. + host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so") + built_module.save(host_dso_binary_path) + print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}") + + hexagon_mod = hexagon_session.load_module(built_module) + + # Generate the input tensor's data. + # Note that we'll eventually need it in two different layouts: + # (1) NHWC as an argument to testing.poolnd_python. + # (2) NHWC_8h8w32c for as an argument to our Hexagon primfunc. + # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8) + a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE) + + ref_output_4d = testing.poolnd_python( + a_numpy_4d.astype("int32"), + KERNEL, + STRIDE, + DILATION, + PADDING[0:2], + PADDING[2:], + pool_type="max", + dtype="int32", + layout="NHWC", + ).astype(DTYPE) + + output_tensor_shape_4d = ref_output_4d.shape + + a_numpy_7d = _int8_nhwc_8h8w32c_xform_immediate(a_numpy_4d) + + a_hexagon_7d = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=input_tensor_shape_7d, + axis_separators=[4], + dtype=DTYPE, + mem_scope=IO_TENSOR_MEM_SCOPE, + ) + + c_hexagon_4d = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=output_tensor_shape_4d, + axis_separators=[], + dtype=DTYPE, + mem_scope=IO_TENSOR_MEM_SCOPE, + ) + + a_hexagon_7d.copyfrom(a_numpy_7d) + + if DTYPE == "int8": + rel_tolerance = 0 + abs_tolerance = 0 + else: + assert False, f"Need to decide acceptable tolerances for DTYPE {DTYPE}" + + # hexagon_mod(a_hexagon_7d, c_hexagon_4d) + # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance) + + timer = hexagon_mod.time_evaluator( + "main", hexagon_session.device, number=10, repeat=1 + ) + timing_result = timer(a_hexagon_7d, c_hexagon_4d) + + try: + tvm.testing.assert_allclose( + ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance + ) + except AssertionError as e: + raise bu.NumericalAccuracyException(str(e)) + + except bu.NumericalAccuracyException as e: + print() + print(f"FAIL: Numerical accuracy error. See log file.") + + log_file.write("\n") + log_file.write(f"FAIL: {e}\n") + + self.benchmark_table.record_fail( + **keys_dict, comments=f"Numerical accuracy error. See log file." + ) + + except bu.UnsupportedException as e: + print() + print(f"SKIP: {e}") + + log_file.write("\n") + log_file.write(f"SKIP: {e}\n") + + self.benchmark_table.record_skip( + **keys_dict, comments=f"Unsupported configuration: {e}" + ) + + self.benchmark_table.record_success(timing_result, **keys_dict) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py index 113c7780c130f..37482d0f8cc36 100644 --- a/tests/python/contrib/test_hexagon/benchmark_util.py +++ b/tests/python/contrib/test_hexagon/benchmark_util.py @@ -18,6 +18,20 @@ import csv +class UnsupportedException(Exception): + """ + Indicates that the specified benchmarking configuration is known to + currently be unsupported. The Exception message may provide more detail. + """ + + +class NumericalAccuracyException(Exception): + """ + Indicates that the benchmarking configuration appeared to run successfully, + but the output data didn't have the expected accuracy. + """ + + class BenchmarksTable: """ Stores/reports the result of benchmark runs.